From 9e3c08db40b8916968b9f30096c7be3f00ce9647 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Sun, 21 Apr 2024 13:44:51 +0200
Subject: Adding upstream version 1:115.7.0.

Signed-off-by: Daniel Baumann
---
 third_party/dav1d/src/x86/cdef.h                   |   87 +
 third_party/dav1d/src/x86/cdef16_avx2.asm          |  877 ++
 third_party/dav1d/src/x86/cdef16_avx512.asm        |  622 ++
 third_party/dav1d/src/x86/cdef16_sse.asm           | 1033 +++
 third_party/dav1d/src/x86/cdef_avx2.asm            | 1772 ++++
 third_party/dav1d/src/x86/cdef_avx512.asm          |  860 ++
 third_party/dav1d/src/x86/cdef_sse.asm             | 1357 +++
 third_party/dav1d/src/x86/cpu.c                    |  100 +
 third_party/dav1d/src/x86/cpu.h                    |   44 +
 third_party/dav1d/src/x86/cpuid.asm                |   55 +
 third_party/dav1d/src/x86/filmgrain.h              |   81 +
 third_party/dav1d/src/x86/filmgrain16_avx2.asm     | 2248 +++++
 third_party/dav1d/src/x86/filmgrain16_avx512.asm   |  932 ++
 third_party/dav1d/src/x86/filmgrain16_sse.asm      | 3421 +++++++
 third_party/dav1d/src/x86/filmgrain_avx2.asm       | 2107 +++++
 third_party/dav1d/src/x86/filmgrain_avx512.asm     |  813 ++
 third_party/dav1d/src/x86/filmgrain_common.asm     |   46 +
 third_party/dav1d/src/x86/filmgrain_sse.asm        | 3233 +++++++
 third_party/dav1d/src/x86/ipred.h                  |  151 +
 third_party/dav1d/src/x86/ipred16_avx2.asm         | 4992 ++++++++++
 third_party/dav1d/src/x86/ipred16_avx512.asm       |  833 ++
 third_party/dav1d/src/x86/ipred16_sse.asm          | 1923 ++++
 third_party/dav1d/src/x86/ipred_avx2.asm           | 5387 +++++++++++
 third_party/dav1d/src/x86/ipred_avx512.asm         | 1432 +++
 third_party/dav1d/src/x86/ipred_sse.asm            | 5409 +++++++++++
 third_party/dav1d/src/x86/itx.h                    |  363 +
 third_party/dav1d/src/x86/itx16_avx2.asm           | 8599 ++++++++++++++++++
 third_party/dav1d/src/x86/itx16_avx512.asm         | 4133 +++++++++
 third_party/dav1d/src/x86/itx16_sse.asm            | 8135 +++++++++++++++++
 third_party/dav1d/src/x86/itx_avx2.asm             | 5542 +++++++++++
 third_party/dav1d/src/x86/itx_avx512.asm           | 7389 +++++++++++++++
 third_party/dav1d/src/x86/itx_sse.asm              | 6533 +++++++++++++
 third_party/dav1d/src/x86/loopfilter.h             |   66 +
 third_party/dav1d/src/x86/loopfilter16_avx2.asm    | 1161 +++
 third_party/dav1d/src/x86/loopfilter16_avx512.asm  |  912 ++
 third_party/dav1d/src/x86/loopfilter16_sse.asm     | 1793 ++++
 third_party/dav1d/src/x86/loopfilter_avx2.asm      | 1569 ++++
 third_party/dav1d/src/x86/loopfilter_avx512.asm    | 1534 ++++
 third_party/dav1d/src/x86/loopfilter_sse.asm       | 2348 +++++
 third_party/dav1d/src/x86/looprestoration.h        |   94 +
 .../dav1d/src/x86/looprestoration16_avx2.asm       | 2540 ++++++
 .../dav1d/src/x86/looprestoration16_avx512.asm     | 2524 +++++
 .../dav1d/src/x86/looprestoration16_sse.asm        | 3723 ++++++++
 third_party/dav1d/src/x86/looprestoration_avx2.asm | 2237 +++++
 .../dav1d/src/x86/looprestoration_avx512.asm       | 2122 +++++
 third_party/dav1d/src/x86/looprestoration_sse.asm  | 3681 ++++++++
 third_party/dav1d/src/x86/mc.h                     |  299 +
 third_party/dav1d/src/x86/mc16_avx2.asm            | 5879 ++++++++++++
 third_party/dav1d/src/x86/mc16_avx512.asm          | 4858 ++++++++++
 third_party/dav1d/src/x86/mc16_sse.asm             | 8731 ++++++++++++++++++
 third_party/dav1d/src/x86/mc_avx2.asm              | 5669 ++++++++++++
 third_party/dav1d/src/x86/mc_avx512.asm            | 4538 +++++++++
 third_party/dav1d/src/x86/mc_sse.asm               | 9599 ++++++++++++++++++++
 third_party/dav1d/src/x86/msac.asm                 |  667 ++
 third_party/dav1d/src/x86/msac.h                   |   75 +
 third_party/dav1d/src/x86/refmvs.asm               |  688 ++
 third_party/dav1d/src/x86/refmvs.h                 |   61 +
 57 files changed, 147877 insertions(+)
 create mode 100644 third_party/dav1d/src/x86/cdef.h
 create mode 100644 third_party/dav1d/src/x86/cdef16_avx2.asm
 create mode 100644 third_party/dav1d/src/x86/cdef16_avx512.asm
 create mode 100644 third_party/dav1d/src/x86/cdef16_sse.asm
 create mode 100644 third_party/dav1d/src/x86/cdef_avx2.asm
 create mode 100644 third_party/dav1d/src/x86/cdef_avx512.asm
 create mode 100644 third_party/dav1d/src/x86/cdef_sse.asm
 create mode 100644 third_party/dav1d/src/x86/cpu.c
 create mode 100644 third_party/dav1d/src/x86/cpu.h
 create mode 100644 third_party/dav1d/src/x86/cpuid.asm
 create mode 100644 third_party/dav1d/src/x86/filmgrain.h
 create mode 100644 third_party/dav1d/src/x86/filmgrain16_avx2.asm
 create mode 100644 third_party/dav1d/src/x86/filmgrain16_avx512.asm
 create mode 100644 third_party/dav1d/src/x86/filmgrain16_sse.asm
 create mode 100644 third_party/dav1d/src/x86/filmgrain_avx2.asm
 create mode 100644 third_party/dav1d/src/x86/filmgrain_avx512.asm
 create mode 100644 third_party/dav1d/src/x86/filmgrain_common.asm
 create mode 100644 third_party/dav1d/src/x86/filmgrain_sse.asm
 create mode 100644 third_party/dav1d/src/x86/ipred.h
 create mode 100644 third_party/dav1d/src/x86/ipred16_avx2.asm
 create mode 100644 third_party/dav1d/src/x86/ipred16_avx512.asm
 create mode 100644 third_party/dav1d/src/x86/ipred16_sse.asm
 create mode 100644 third_party/dav1d/src/x86/ipred_avx2.asm
 create mode 100644 third_party/dav1d/src/x86/ipred_avx512.asm
 create mode 100644 third_party/dav1d/src/x86/ipred_sse.asm
 create mode 100644 third_party/dav1d/src/x86/itx.h
 create mode 100644 third_party/dav1d/src/x86/itx16_avx2.asm
 create mode 100644 third_party/dav1d/src/x86/itx16_avx512.asm
 create mode 100644 third_party/dav1d/src/x86/itx16_sse.asm
 create mode 100644 third_party/dav1d/src/x86/itx_avx2.asm
 create mode 100644 third_party/dav1d/src/x86/itx_avx512.asm
 create mode 100644 third_party/dav1d/src/x86/itx_sse.asm
 create mode 100644 third_party/dav1d/src/x86/loopfilter.h
 create mode 100644 third_party/dav1d/src/x86/loopfilter16_avx2.asm
 create mode 100644 third_party/dav1d/src/x86/loopfilter16_avx512.asm
 create mode 100644 third_party/dav1d/src/x86/loopfilter16_sse.asm
 create mode 100644 third_party/dav1d/src/x86/loopfilter_avx2.asm
 create mode 100644 third_party/dav1d/src/x86/loopfilter_avx512.asm
 create mode 100644 third_party/dav1d/src/x86/loopfilter_sse.asm
 create mode 100644 third_party/dav1d/src/x86/looprestoration.h
 create mode 100644 third_party/dav1d/src/x86/looprestoration16_avx2.asm
 create mode 100644 third_party/dav1d/src/x86/looprestoration16_avx512.asm
 create mode 100644 third_party/dav1d/src/x86/looprestoration16_sse.asm
 create mode 100644 third_party/dav1d/src/x86/looprestoration_avx2.asm
 create mode 100644 third_party/dav1d/src/x86/looprestoration_avx512.asm
 create mode 100644 third_party/dav1d/src/x86/looprestoration_sse.asm
 create mode 100644 third_party/dav1d/src/x86/mc.h
 create mode 100644 third_party/dav1d/src/x86/mc16_avx2.asm
 create mode 100644 third_party/dav1d/src/x86/mc16_avx512.asm
 create mode 100644 third_party/dav1d/src/x86/mc16_sse.asm
 create mode 100644 third_party/dav1d/src/x86/mc_avx2.asm
 create mode 100644 third_party/dav1d/src/x86/mc_avx512.asm
 create mode 100644 third_party/dav1d/src/x86/mc_sse.asm
 create mode 100644 third_party/dav1d/src/x86/msac.asm
 create mode 100644 third_party/dav1d/src/x86/msac.h
 create mode 100644 third_party/dav1d/src/x86/refmvs.asm
 create mode 100644 third_party/dav1d/src/x86/refmvs.h
(limited to 'third_party/dav1d/src/x86')

diff --git a/third_party/dav1d/src/x86/cdef.h b/third_party/dav1d/src/x86/cdef.h
new file mode 100644
index 0000000000..553d650741
--- /dev/null
+++ b/third_party/dav1d/src/x86/cdef.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/cpu.h"
+#include "src/cdef.h"
+
+#define decl_cdef_fns(ext) \
+decl_cdef_fn(BF(dav1d_cdef_filter_4x4, ext)); \
+decl_cdef_fn(BF(dav1d_cdef_filter_4x8, ext)); \
+decl_cdef_fn(BF(dav1d_cdef_filter_8x8, ext))
+
+decl_cdef_fns(avx512icl);
+decl_cdef_fns(avx2);
+decl_cdef_fns(sse4);
+decl_cdef_fns(ssse3);
+decl_cdef_fns(sse2);
+
+decl_cdef_dir_fn(BF(dav1d_cdef_dir, avx2));
+decl_cdef_dir_fn(BF(dav1d_cdef_dir, sse4));
+decl_cdef_dir_fn(BF(dav1d_cdef_dir, ssse3));
+
+static ALWAYS_INLINE void cdef_dsp_init_x86(Dav1dCdefDSPContext *const c) {
+    const unsigned flags = dav1d_get_cpu_flags();
+
+#if BITDEPTH == 8
+    if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
+
+    c->fb[0] = BF(dav1d_cdef_filter_8x8, sse2);
+    c->fb[1] = BF(dav1d_cdef_filter_4x8, sse2);
+    c->fb[2] = BF(dav1d_cdef_filter_4x4, sse2);
+#endif
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
+
+    c->dir = BF(dav1d_cdef_dir, ssse3);
+    c->fb[0] = BF(dav1d_cdef_filter_8x8, ssse3);
+    c->fb[1] = BF(dav1d_cdef_filter_4x8, ssse3);
+    c->fb[2] = BF(dav1d_cdef_filter_4x4, ssse3);
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
+
+    c->dir = BF(dav1d_cdef_dir, sse4);
+#if BITDEPTH == 8
+    c->fb[0] = BF(dav1d_cdef_filter_8x8, sse4);
+    c->fb[1] = BF(dav1d_cdef_filter_4x8, sse4);
+    c->fb[2] = BF(dav1d_cdef_filter_4x4, sse4);
+#endif
+
+#if ARCH_X86_64
+    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+    c->dir = BF(dav1d_cdef_dir, avx2);
+    c->fb[0] = BF(dav1d_cdef_filter_8x8, avx2);
+    c->fb[1] = BF(dav1d_cdef_filter_4x8, avx2);
+    c->fb[2] = BF(dav1d_cdef_filter_4x4, avx2);
+
+    if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
+
+    c->fb[0] = BF(dav1d_cdef_filter_8x8, avx512icl);
+    c->fb[1] = BF(dav1d_cdef_filter_4x8, avx512icl);
+    c->fb[2] = BF(dav1d_cdef_filter_4x4, avx512icl);
+#endif
+}
diff --git a/third_party/dav1d/src/x86/cdef16_avx2.asm b/third_party/dav1d/src/x86/cdef16_avx2.asm
new file mode 100644
index 0000000000..4c8d3bca43
--- /dev/null
+++ b/third_party/dav1d/src/x86/cdef16_avx2.asm
@@ -0,0 +1,877 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
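
A note for readers of the assembly bodies that follow: every CDEF filter kernel in these files is built around AV1's constrain() primitive, which the SIMD code expresses as a recurring pabsw/psrlw/psubusw/pminsw/psignw sequence. A scalar C model of what that instruction sequence computes — the function name and layout here are illustrative, not part of the patch:

#include <stdlib.h>

/* Taper each tap's contribution: the larger the difference between a
 * neighboring pixel and the center pixel, the less of it is applied. */
static int constrain(int diff, int threshold, int shift) {
    int adiff = abs(diff);                    /* pabsw */
    int limit = threshold - (adiff >> shift); /* psrlw + psubusw */
    if (limit < 0) limit = 0;                 /* psubusw saturates at zero */
    int v = adiff < limit ? adiff : limit;    /* pminsw */
    return diff < 0 ? -v : v;                 /* psignw restores the sign */
}
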
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA + +%macro DIR_TABLE 1 ; stride + db 1 * %1 + 0, 2 * %1 + 0 + db 1 * %1 + 0, 2 * %1 - 2 + db -1 * %1 + 2, -2 * %1 + 4 + db 0 * %1 + 2, -1 * %1 + 4 + db 0 * %1 + 2, 0 * %1 + 4 + db 0 * %1 + 2, 1 * %1 + 4 + db 1 * %1 + 2, 2 * %1 + 4 + db 1 * %1 + 0, 2 * %1 + 2 + db 1 * %1 + 0, 2 * %1 + 0 + db 1 * %1 + 0, 2 * %1 - 2 + db -1 * %1 + 2, -2 * %1 + 4 + db 0 * %1 + 2, -1 * %1 + 4 +%endmacro + +dir_table4: DIR_TABLE 16 +dir_table8: DIR_TABLE 32 +pri_taps: dw 4, 4, 3, 3, 2, 2, 3, 3 + +dir_shift: times 2 dw 0x4000 + times 2 dw 0x1000 + +pw_2048: times 2 dw 2048 +pw_m16384: times 2 dw -16384 + +cextern cdef_dir_8bpc_avx2.main + +SECTION .text + +%macro CDEF_FILTER 2 ; w, h + DEFINE_ARGS dst, stride, _, dir, pridmp, pri, sec, tmp + movifnidn prid, r5m + movifnidn secd, r6m + mov dird, r7m + vpbroadcastd m8, [base+pw_2048] + lea dirq, [base+dir_table%1+dirq*2] + test prid, prid + jz .sec_only +%if WIN64 + vpbroadcastw m6, prim + movaps [rsp+16*0], xmm9 + movaps [rsp+16*1], xmm10 +%else + movd xm6, prid + vpbroadcastw m6, xm6 +%endif + lzcnt pridmpd, prid + rorx tmpd, prid, 2 + cmp dword r10m, 0xfff ; if (bpc == 12) + cmove prid, tmpd ; pri >>= 2 + mov tmpd, r8m ; damping + and prid, 4 + sub tmpd, 31 + vpbroadcastd m9, [base+pri_taps+priq+8*0] + vpbroadcastd m10, [base+pri_taps+priq+8*1] + test secd, secd + jz .pri_only +%if WIN64 + movaps r8m, xmm13 + vpbroadcastw m13, secm + movaps r4m, xmm11 + movaps r6m, xmm12 +%else + movd xm0, secd + vpbroadcastw m13, xm0 +%endif + lzcnt secd, secd + xor prid, prid + add pridmpd, tmpd + cmovs pridmpd, prid + add secd, tmpd + lea tmpq, [px] + mov [pri_shift], pridmpq + mov [sec_shift], secq +%rep %1*%2/16 + call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec +%endrep +%if WIN64 + movaps xmm11, r4m + movaps xmm12, r6m + movaps xmm13, r8m +%endif + jmp .pri_end +.pri_only: + add pridmpd, tmpd + cmovs pridmpd, secd + lea tmpq, [px] + mov [pri_shift], pridmpq +%rep %1*%2/16 + call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri +%endrep +.pri_end: +%if WIN64 + movaps xmm9, [rsp+16*0] + movaps xmm10, [rsp+16*1] +%endif +.end: + RET +.sec_only: + mov tmpd, r8m ; damping +%if 
WIN64 + vpbroadcastw m6, secm +%else + movd xm6, secd + vpbroadcastw m6, xm6 +%endif + tzcnt secd, secd + sub tmpd, secd + mov [sec_shift], tmpq + lea tmpq, [px] +%rep %1*%2/16 + call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec +%endrep + jmp .end +%if %1 == %2 +ALIGN function_align +.pri: + movsx offq, byte [dirq+4] ; off_k0 +%if %1 == 4 + mova m1, [tmpq+32*0] + punpcklqdq m1, [tmpq+32*1] ; 0 2 1 3 + movu m2, [tmpq+offq+32*0] + punpcklqdq m2, [tmpq+offq+32*1] ; k0p0 + neg offq + movu m3, [tmpq+offq+32*0] + punpcklqdq m3, [tmpq+offq+32*1] ; k0p1 +%else + mova xm1, [tmpq+32*0] + vinserti128 m1, [tmpq+32*1], 1 + movu xm2, [tmpq+offq+32*0] + vinserti128 m2, [tmpq+offq+32*1], 1 + neg offq + movu xm3, [tmpq+offq+32*0] + vinserti128 m3, [tmpq+offq+32*1], 1 +%endif + movsx offq, byte [dirq+5] ; off_k1 + psubw m2, m1 ; diff_k0p0 + psubw m3, m1 ; diff_k0p1 + pabsw m4, m2 ; adiff_k0p0 + psrlw m5, m4, [pri_shift+gprsize] + psubusw m0, m6, m5 + pabsw m5, m3 ; adiff_k0p1 + pminsw m0, m4 + psrlw m4, m5, [pri_shift+gprsize] + psignw m0, m2 ; constrain(diff_k0p0) + psubusw m2, m6, m4 + pminsw m2, m5 +%if %1 == 4 + movu m4, [tmpq+offq+32*0] + punpcklqdq m4, [tmpq+offq+32*1] ; k1p0 + neg offq + movu m5, [tmpq+offq+32*0] + punpcklqdq m5, [tmpq+offq+32*1] ; k1p1 +%else + movu xm4, [tmpq+offq+32*0] + vinserti128 m4, [tmpq+offq+32*1], 1 + neg offq + movu xm5, [tmpq+offq+32*0] + vinserti128 m5, [tmpq+offq+32*1], 1 +%endif + psubw m4, m1 ; diff_k1p0 + psubw m5, m1 ; diff_k1p1 + psignw m2, m3 ; constrain(diff_k0p1) + pabsw m3, m4 ; adiff_k1p0 + paddw m0, m2 ; constrain(diff_k0) + psrlw m2, m3, [pri_shift+gprsize] + psubusw m7, m6, m2 + pabsw m2, m5 ; adiff_k1p1 + pminsw m7, m3 + psrlw m3, m2, [pri_shift+gprsize] + psignw m7, m4 ; constrain(diff_k1p0) + psubusw m4, m6, m3 + pminsw m4, m2 + psignw m4, m5 ; constrain(diff_k1p1) + paddw m7, m4 ; constrain(diff_k1) + pmullw m0, m9 ; pri_tap_k0 + pmullw m7, m10 ; pri_tap_k1 + paddw m0, m7 ; sum + psraw m2, m0, 15 + paddw m0, m2 + pmulhrsw m0, m8 + add tmpq, 32*2 + paddw m0, m1 +%if %1 == 4 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+r9 ], xm1 + lea dstq, [dstq+strideq*4] +%else + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] +%endif + ret +ALIGN function_align +.sec: + movsx offq, byte [dirq+8] ; off1_k0 +%if %1 == 4 + mova m1, [tmpq+32*0] + punpcklqdq m1, [tmpq+32*1] + movu m2, [tmpq+offq+32*0] + punpcklqdq m2, [tmpq+offq+32*1] ; k0s0 + neg offq + movu m3, [tmpq+offq+32*0] + punpcklqdq m3, [tmpq+offq+32*1] ; k0s1 +%else + mova xm1, [tmpq+32*0] + vinserti128 m1, [tmpq+32*1], 1 + movu xm2, [tmpq+offq+32*0] + vinserti128 m2, [tmpq+offq+32*1], 1 + neg offq + movu xm3, [tmpq+offq+32*0] + vinserti128 m3, [tmpq+offq+32*1], 1 +%endif + movsx offq, byte [dirq+0] ; off2_k0 + psubw m2, m1 ; diff_k0s0 + psubw m3, m1 ; diff_k0s1 + pabsw m4, m2 ; adiff_k0s0 + psrlw m5, m4, [sec_shift+gprsize] + psubusw m0, m6, m5 + pabsw m5, m3 ; adiff_k0s1 + pminsw m0, m4 + psrlw m4, m5, [sec_shift+gprsize] + psignw m0, m2 ; constrain(diff_k0s0) + psubusw m2, m6, m4 + pminsw m2, m5 +%if %1 == 4 + movu m4, [tmpq+offq+32*0] + punpcklqdq m4, [tmpq+offq+32*1] ; k0s2 + neg offq + movu m5, [tmpq+offq+32*0] + punpcklqdq m5, [tmpq+offq+32*1] ; k0s3 +%else + movu xm4, [tmpq+offq+32*0] + vinserti128 m4, [tmpq+offq+32*1], 1 + neg offq + movu xm5, [tmpq+offq+32*0] + vinserti128 m5, [tmpq+offq+32*1], 1 +%endif + movsx offq, byte [dirq+9] ; off1_k1 + psubw 
m4, m1 ; diff_k0s2 + psubw m5, m1 ; diff_k0s3 + psignw m2, m3 ; constrain(diff_k0s1) + pabsw m3, m4 ; adiff_k0s2 + paddw m0, m2 + psrlw m2, m3, [sec_shift+gprsize] + psubusw m7, m6, m2 + pabsw m2, m5 ; adiff_k0s3 + pminsw m7, m3 + psrlw m3, m2, [sec_shift+gprsize] + psignw m7, m4 ; constrain(diff_k0s2) + psubusw m4, m6, m3 + pminsw m4, m2 +%if %1 == 4 + movu m2, [tmpq+offq+32*0] + punpcklqdq m2, [tmpq+offq+32*1] ; k1s0 + neg offq + movu m3, [tmpq+offq+32*0] + punpcklqdq m3, [tmpq+offq+32*1] ; k1s1 +%else + movu xm2, [tmpq+offq+32*0] + vinserti128 m2, [tmpq+offq+32*1], 1 + neg offq + movu xm3, [tmpq+offq+32*0] + vinserti128 m3, [tmpq+offq+32*1], 1 +%endif + movsx offq, byte [dirq+1] ; off2_k1 + paddw m0, m7 + psignw m4, m5 ; constrain(diff_k0s3) + paddw m0, m4 ; constrain(diff_k0) + psubw m2, m1 ; diff_k1s0 + psubw m3, m1 ; diff_k1s1 + paddw m0, m0 ; sec_tap_k0 + pabsw m4, m2 ; adiff_k1s0 + psrlw m5, m4, [sec_shift+gprsize] + psubusw m7, m6, m5 + pabsw m5, m3 ; adiff_k1s1 + pminsw m7, m4 + psrlw m4, m5, [sec_shift+gprsize] + psignw m7, m2 ; constrain(diff_k1s0) + psubusw m2, m6, m4 + pminsw m2, m5 +%if %1 == 4 + movu m4, [tmpq+offq+32*0] + punpcklqdq m4, [tmpq+offq+32*1] ; k1s2 + neg offq + movu m5, [tmpq+offq+32*0] + punpcklqdq m5, [tmpq+offq+32*1] ; k1s3 +%else + movu xm4, [tmpq+offq+32*0] + vinserti128 m4, [tmpq+offq+32*1], 1 + neg offq + movu xm5, [tmpq+offq+32*0] + vinserti128 m5, [tmpq+offq+32*1], 1 +%endif + paddw m0, m7 + psubw m4, m1 ; diff_k1s2 + psubw m5, m1 ; diff_k1s3 + psignw m2, m3 ; constrain(diff_k1s1) + pabsw m3, m4 ; adiff_k1s2 + paddw m0, m2 + psrlw m2, m3, [sec_shift+gprsize] + psubusw m7, m6, m2 + pabsw m2, m5 ; adiff_k1s3 + pminsw m7, m3 + psrlw m3, m2, [sec_shift+gprsize] + psignw m7, m4 ; constrain(diff_k1s2) + psubusw m4, m6, m3 + pminsw m4, m2 + paddw m0, m7 + psignw m4, m5 ; constrain(diff_k1s3) + paddw m0, m4 ; sum + psraw m2, m0, 15 + paddw m0, m2 + pmulhrsw m0, m8 + add tmpq, 32*2 + paddw m0, m1 +%if %1 == 4 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+r9 ], xm1 + lea dstq, [dstq+strideq*4] +%else + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] +%endif + ret +ALIGN function_align +.pri_sec: + movsx offq, byte [dirq+8] ; off2_k0 +%if %1 == 4 + mova m1, [tmpq+32*0] + punpcklqdq m1, [tmpq+32*1] + movu m2, [tmpq+offq+32*0] + punpcklqdq m2, [tmpq+offq+32*1] ; k0s0 + neg offq + movu m3, [tmpq+offq+32*0] + punpcklqdq m3, [tmpq+offq+32*1] ; k0s1 +%else + mova xm1, [dstq+strideq*0] + vinserti128 m1, [dstq+strideq*1], 1 + movu xm2, [tmpq+offq+32*0] + vinserti128 m2, [tmpq+offq+32*1], 1 + neg offq + movu xm3, [tmpq+offq+32*0] + vinserti128 m3, [tmpq+offq+32*1], 1 +%endif + movsx offq, byte [dirq+0] ; off3_k0 + pmaxsw m11, m2, m3 + pminuw m12, m2, m3 + psubw m2, m1 ; diff_k0s0 + psubw m3, m1 ; diff_k0s1 + pabsw m4, m2 ; adiff_k0s0 + psrlw m5, m4, [sec_shift+gprsize] + psubusw m0, m13, m5 + pabsw m5, m3 ; adiff_k0s1 + pminsw m0, m4 + psrlw m4, m5, [sec_shift+gprsize] + psignw m0, m2 ; constrain(diff_k0s0) + psubusw m2, m13, m4 + pminsw m2, m5 +%if %1 == 4 + movu m4, [tmpq+offq+32*0] + punpcklqdq m4, [tmpq+offq+32*1] ; k0s2 + neg offq + movu m5, [tmpq+offq+32*0] + punpcklqdq m5, [tmpq+offq+32*1] ; k0s3 +%else + movu xm4, [tmpq+offq+32*0] + vinserti128 m4, [tmpq+offq+32*1], 1 + neg offq + movu xm5, [tmpq+offq+32*0] + vinserti128 m5, [tmpq+offq+32*1], 1 +%endif + movsx offq, byte [dirq+9] ; off2_k1 + psignw m2, m3 ; constrain(diff_k0s1) + 
pmaxsw m11, m4 + pminuw m12, m4 + pmaxsw m11, m5 + pminuw m12, m5 + psubw m4, m1 ; diff_k0s2 + psubw m5, m1 ; diff_k0s3 + paddw m0, m2 + pabsw m3, m4 ; adiff_k0s2 + psrlw m2, m3, [sec_shift+gprsize] + psubusw m7, m13, m2 + pabsw m2, m5 ; adiff_k0s3 + pminsw m7, m3 + psrlw m3, m2, [sec_shift+gprsize] + psignw m7, m4 ; constrain(diff_k0s2) + psubusw m4, m13, m3 + pminsw m4, m2 +%if %1 == 4 + movu m2, [tmpq+offq+32*0] + punpcklqdq m2, [tmpq+offq+32*1] ; k1s0 + neg offq + movu m3, [tmpq+offq+32*0] + punpcklqdq m3, [tmpq+offq+32*1] ; k1s1 +%else + movu xm2, [tmpq+offq+32*0] + vinserti128 m2, [tmpq+offq+32*1], 1 + neg offq + movu xm3, [tmpq+offq+32*0] + vinserti128 m3, [tmpq+offq+32*1], 1 +%endif + movsx offq, byte [dirq+1] ; off3_k1 + paddw m0, m7 + psignw m4, m5 ; constrain(diff_k0s3) + pmaxsw m11, m2 + pminuw m12, m2 + pmaxsw m11, m3 + pminuw m12, m3 + paddw m0, m4 ; constrain(diff_k0) + psubw m2, m1 ; diff_k1s0 + psubw m3, m1 ; diff_k1s1 + paddw m0, m0 ; sec_tap_k0 + pabsw m4, m2 ; adiff_k1s0 + psrlw m5, m4, [sec_shift+gprsize] + psubusw m7, m13, m5 + pabsw m5, m3 ; adiff_k1s1 + pminsw m7, m4 + psrlw m4, m5, [sec_shift+gprsize] + psignw m7, m2 ; constrain(diff_k1s0) + psubusw m2, m13, m4 + pminsw m2, m5 +%if %1 == 4 + movu m4, [tmpq+offq+32*0] + punpcklqdq m4, [tmpq+offq+32*1] ; k1s2 + neg offq + movu m5, [tmpq+offq+32*0] + punpcklqdq m5, [tmpq+offq+32*1] ; k1s3 +%else + movu xm4, [tmpq+offq+32*0] + vinserti128 m4, [tmpq+offq+32*1], 1 + neg offq + movu xm5, [tmpq+offq+32*0] + vinserti128 m5, [tmpq+offq+32*1], 1 +%endif + movsx offq, byte [dirq+4] ; off1_k0 + paddw m0, m7 + psignw m2, m3 ; constrain(diff_k1s1) + pmaxsw m11, m4 + pminuw m12, m4 + pmaxsw m11, m5 + pminuw m12, m5 + psubw m4, m1 ; diff_k1s2 + psubw m5, m1 ; diff_k1s3 + pabsw m3, m4 ; adiff_k1s2 + paddw m0, m2 + psrlw m2, m3, [sec_shift+gprsize] + psubusw m7, m13, m2 + pabsw m2, m5 ; adiff_k1s3 + pminsw m7, m3 + psrlw m3, m2, [sec_shift+gprsize] + psignw m7, m4 ; constrain(diff_k1s2) + psubusw m4, m13, m3 + pminsw m4, m2 + paddw m0, m7 +%if %1 == 4 + movu m2, [tmpq+offq+32*0] + punpcklqdq m2, [tmpq+offq+32*1] ; k0p0 + neg offq + movu m3, [tmpq+offq+32*0] + punpcklqdq m3, [tmpq+offq+32*1] ; k0p1 +%else + movu xm2, [tmpq+offq+32*0] + vinserti128 m2, [tmpq+offq+32*1], 1 + neg offq + movu xm3, [tmpq+offq+32*0] + vinserti128 m3, [tmpq+offq+32*1], 1 +%endif + movsx offq, byte [dirq+5] ; off1_k1 + psignw m4, m5 ; constrain(diff_k1s3) + pmaxsw m11, m2 + pminuw m12, m2 + pmaxsw m11, m3 + pminuw m12, m3 + psubw m2, m1 ; diff_k0p0 + psubw m3, m1 ; diff_k0p1 + paddw m0, m4 + pabsw m4, m2 ; adiff_k0p0 + psrlw m5, m4, [pri_shift+gprsize] + psubusw m7, m6, m5 + pabsw m5, m3 ; adiff_k0p1 + pminsw m7, m4 + psrlw m4, m5, [pri_shift+gprsize] + psignw m7, m2 ; constrain(diff_k0p0) + psubusw m2, m6, m4 + pminsw m2, m5 +%if %1 == 4 + movu m4, [tmpq+offq+32*0] + punpcklqdq m4, [tmpq+offq+32*1] ; k1p0 + neg offq + movu m5, [tmpq+offq+32*0] + punpcklqdq m5, [tmpq+offq+32*1] ; k1p1 +%else + movu xm4, [tmpq+offq+32*0] + vinserti128 m4, [tmpq+offq+32*1], 1 + neg offq + movu xm5, [tmpq+offq+32*0] + vinserti128 m5, [tmpq+offq+32*1], 1 +%endif + psignw m2, m3 ; constrain(diff_k0p1) + paddw m7, m2 ; constrain(diff_k0) + pmaxsw m11, m4 + pminuw m12, m4 + pmaxsw m11, m5 + pminuw m12, m5 + psubw m4, m1 ; diff_k1p0 + psubw m5, m1 ; diff_k1p1 + pabsw m3, m4 ; adiff_k1p0 + pmullw m7, m9 ; pri_tap_k0 + paddw m0, m7 + psrlw m2, m3, [pri_shift+gprsize] + psubusw m7, m6, m2 + pabsw m2, m5 ; adiff_k1p1 + pminsw m7, m3 + psrlw m3, m2, [pri_shift+gprsize] + psignw m7, m4 ; 
constrain(diff_k1p0) + psubusw m4, m6, m3 + pminsw m4, m2 + psignw m4, m5 ; constrain(diff_k1p1) + paddw m7, m4 ; constrain(diff_k1) + pmullw m7, m10 ; pri_tap_k1 + paddw m0, m7 ; sum + psraw m2, m0, 15 + paddw m0, m2 + pmulhrsw m0, m8 + add tmpq, 32*2 + pmaxsw m11, m1 + pminuw m12, m1 + paddw m0, m1 + pminsw m0, m11 + pmaxsw m0, m12 +%if %1 == 4 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+r9 ], xm1 + lea dstq, [dstq+strideq*4] +%else + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] +%endif + ret +%endif +%endmacro + +INIT_YMM avx2 +cglobal cdef_filter_4x4_16bpc, 5, 10, 9, 16*10, dst, stride, left, top, bot, \ + pri, sec, edge +%if WIN64 + %define px rsp+16*6 + %define offq r8 + %define pri_shift rsp+16*2 + %define sec_shift rsp+16*3 +%else + %define px rsp+16*4 + %define offq r4 + %define pri_shift rsp+16*0 + %define sec_shift rsp+16*1 +%endif + %define base r8-dir_table4 + mov edged, r9m + lea r8, [dir_table4] + movu xm0, [dstq+strideq*0] + movu xm1, [dstq+strideq*1] + lea r9, [strideq*3] + movu xm2, [dstq+strideq*2] + movu xm3, [dstq+r9 ] + vpbroadcastd m7, [base+pw_m16384] + mova [px+16*0+0], xm0 + mova [px+16*1+0], xm1 + mova [px+16*2+0], xm2 + mova [px+16*3+0], xm3 + test edgeb, 4 ; HAVE_TOP + jz .no_top + movu xm0, [topq+strideq*0] + movu xm1, [topq+strideq*1] + mova [px-16*2+0], xm0 + mova [px-16*1+0], xm1 + test edgeb, 1 ; HAVE_LEFT + jz .top_no_left + movd xm0, [topq+strideq*0-4] + movd xm1, [topq+strideq*1-4] + movd [px-16*2-4], xm0 + movd [px-16*1-4], xm1 + jmp .top_done +.no_top: + mova [px-16*2+0], m7 +.top_no_left: + movd [px-16*2-4], xm7 + movd [px-16*1-4], xm7 +.top_done: + test edgeb, 8 ; HAVE_BOTTOM + jz .no_bottom + movu xm0, [botq+strideq*0] + movu xm1, [botq+strideq*1] + mova [px+16*4+0], xm0 + mova [px+16*5+0], xm1 + test edgeb, 1 ; HAVE_LEFT + jz .bottom_no_left + movd xm0, [botq+strideq*0-4] + movd xm1, [botq+strideq*1-4] + movd [px+16*4-4], xm0 + movd [px+16*5-4], xm1 + jmp .bottom_done +.no_bottom: + mova [px+16*4+0], m7 +.bottom_no_left: + movd [px+16*4-4], xm7 + movd [px+16*5-4], xm7 +.bottom_done: + test edgeb, 1 ; HAVE_LEFT + jz .no_left + movd xm0, [leftq+4*0] + movd xm1, [leftq+4*1] + movd xm2, [leftq+4*2] + movd xm3, [leftq+4*3] + movd [px+16*0-4], xm0 + movd [px+16*1-4], xm1 + movd [px+16*2-4], xm2 + movd [px+16*3-4], xm3 + jmp .left_done +.no_left: + REPX {movd [px+16*x-4], xm7}, 0, 1, 2, 3 +.left_done: + test edgeb, 2 ; HAVE_RIGHT + jnz .padding_done + REPX {movd [px+16*x+8], xm7}, -2, -1, 0, 1, 2, 3, 4, 5 +.padding_done: + CDEF_FILTER 4, 4 + +cglobal cdef_filter_4x8_16bpc, 5, 10, 9, 16*14, dst, stride, left, top, bot, \ + pri, sec, edge + mov edged, r9m + movu xm0, [dstq+strideq*0] + movu xm1, [dstq+strideq*1] + lea r9, [strideq*3] + movu xm2, [dstq+strideq*2] + movu xm3, [dstq+r9 ] + lea r6, [dstq+strideq*4] + movu xm4, [r6 +strideq*0] + movu xm5, [r6 +strideq*1] + movu xm6, [r6 +strideq*2] + movu xm7, [r6 +r9 ] + lea r8, [dir_table4] + mova [px+16*0+0], xm0 + mova [px+16*1+0], xm1 + mova [px+16*2+0], xm2 + mova [px+16*3+0], xm3 + mova [px+16*4+0], xm4 + mova [px+16*5+0], xm5 + mova [px+16*6+0], xm6 + mova [px+16*7+0], xm7 + vpbroadcastd m7, [base+pw_m16384] + test edgeb, 4 ; HAVE_TOP + jz .no_top + movu xm0, [topq+strideq*0] + movu xm1, [topq+strideq*1] + mova [px-16*2+0], xm0 + mova [px-16*1+0], xm1 + test edgeb, 1 ; HAVE_LEFT + jz .top_no_left + movd xm0, [topq+strideq*0-4] + movd xm1, [topq+strideq*1-4] + movd 
[px-16*2-4], xm0 + movd [px-16*1-4], xm1 + jmp .top_done +.no_top: + mova [px-16*2+0], m7 +.top_no_left: + movd [px-16*2-4], xm7 + movd [px-16*1-4], xm7 +.top_done: + test edgeb, 8 ; HAVE_BOTTOM + jz .no_bottom + movu xm0, [botq+strideq*0] + movu xm1, [botq+strideq*1] + mova [px+16*8+0], xm0 + mova [px+16*9+0], xm1 + test edgeb, 1 ; HAVE_LEFT + jz .bottom_no_left + movd xm0, [botq+strideq*0-4] + movd xm1, [botq+strideq*1-4] + movd [px+16*8-4], xm0 + movd [px+16*9-4], xm1 + jmp .bottom_done +.no_bottom: + mova [px+16*8+0], m7 +.bottom_no_left: + movd [px+16*8-4], xm7 + movd [px+16*9-4], xm7 +.bottom_done: + test edgeb, 1 ; HAVE_LEFT + jz .no_left + movd xm0, [leftq+4*0] + movd xm1, [leftq+4*1] + movd xm2, [leftq+4*2] + movd xm3, [leftq+4*3] + movd [px+16*0-4], xm0 + movd [px+16*1-4], xm1 + movd [px+16*2-4], xm2 + movd [px+16*3-4], xm3 + movd xm0, [leftq+4*4] + movd xm1, [leftq+4*5] + movd xm2, [leftq+4*6] + movd xm3, [leftq+4*7] + movd [px+16*4-4], xm0 + movd [px+16*5-4], xm1 + movd [px+16*6-4], xm2 + movd [px+16*7-4], xm3 + jmp .left_done +.no_left: + REPX {movd [px+16*x-4], xm7}, 0, 1, 2, 3, 4, 5, 6, 7 +.left_done: + test edgeb, 2 ; HAVE_RIGHT + jnz .padding_done + REPX {movd [px+16*x+8], xm7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 +.padding_done: + CDEF_FILTER 4, 8 + +cglobal cdef_filter_8x8_16bpc, 5, 9, 9, 32*13, dst, stride, left, top, bot, \ + pri, sec, edge +%if WIN64 + %define px rsp+32*4 +%else + %define px rsp+32*3 +%endif + %define base r8-dir_table8 + mov edged, r9m + movu m0, [dstq+strideq*0] + movu m1, [dstq+strideq*1] + lea r6, [dstq+strideq*2] + movu m2, [r6 +strideq*0] + movu m3, [r6 +strideq*1] + lea r6, [r6 +strideq*2] + movu m4, [r6 +strideq*0] + movu m5, [r6 +strideq*1] + lea r6, [r6 +strideq*2] + movu m6, [r6 +strideq*0] + movu m7, [r6 +strideq*1] + lea r8, [dir_table8] + mova [px+32*0+0], m0 + mova [px+32*1+0], m1 + mova [px+32*2+0], m2 + mova [px+32*3+0], m3 + mova [px+32*4+0], m4 + mova [px+32*5+0], m5 + mova [px+32*6+0], m6 + mova [px+32*7+0], m7 + vpbroadcastd m7, [base+pw_m16384] + test edgeb, 4 ; HAVE_TOP + jz .no_top + movu m0, [topq+strideq*0] + movu m1, [topq+strideq*1] + mova [px-32*2+0], m0 + mova [px-32*1+0], m1 + test edgeb, 1 ; HAVE_LEFT + jz .top_no_left + movd xm0, [topq+strideq*0-4] + movd xm1, [topq+strideq*1-4] + movd [px-32*2-4], xm0 + movd [px-32*1-4], xm1 + jmp .top_done +.no_top: + mova [px-32*2+0], m7 + mova [px-32*1+0], m7 +.top_no_left: + movd [px-32*2-4], xm7 + movd [px-32*1-4], xm7 +.top_done: + test edgeb, 8 ; HAVE_BOTTOM + jz .no_bottom + movu m0, [botq+strideq*0] + movu m1, [botq+strideq*1] + mova [px+32*8+0], m0 + mova [px+32*9+0], m1 + test edgeb, 1 ; HAVE_LEFT + jz .bottom_no_left + movd xm0, [botq+strideq*0-4] + movd xm1, [botq+strideq*1-4] + movd [px+32*8-4], xm0 + movd [px+32*9-4], xm1 + jmp .bottom_done +.no_bottom: + mova [px+32*8+0], m7 + mova [px+32*9+0], m7 +.bottom_no_left: + movd [px+32*8-4], xm7 + movd [px+32*9-4], xm7 +.bottom_done: + test edgeb, 1 ; HAVE_LEFT + jz .no_left + movd xm0, [leftq+4*0] + movd xm1, [leftq+4*1] + movd xm2, [leftq+4*2] + movd xm3, [leftq+4*3] + movd [px+32*0-4], xm0 + movd [px+32*1-4], xm1 + movd [px+32*2-4], xm2 + movd [px+32*3-4], xm3 + movd xm0, [leftq+4*4] + movd xm1, [leftq+4*5] + movd xm2, [leftq+4*6] + movd xm3, [leftq+4*7] + movd [px+32*4-4], xm0 + movd [px+32*5-4], xm1 + movd [px+32*6-4], xm2 + movd [px+32*7-4], xm3 + jmp .left_done +.no_left: + REPX {movd [px+32*x-4], xm7}, 0, 1, 2, 3, 4, 5, 6, 7 +.left_done: + test edgeb, 2 ; HAVE_RIGHT + jnz .padding_done + REPX {movd [px+32*x+16], 
xm7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 +.padding_done: + CDEF_FILTER 8, 8 + +cglobal cdef_dir_16bpc, 4, 7, 6, src, stride, var, bdmax + lea r6, [dir_shift] + shr bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc + vpbroadcastd m4, [r6+bdmaxq*4] + lea r6, [strideq*3] + mova xm0, [srcq+strideq*0] + mova xm1, [srcq+strideq*1] + mova xm2, [srcq+strideq*2] + mova xm3, [srcq+r6 ] + lea srcq, [srcq+strideq*4] + vinserti128 m0, [srcq+r6 ], 1 + vinserti128 m1, [srcq+strideq*2], 1 + vinserti128 m2, [srcq+strideq*1], 1 + vinserti128 m3, [srcq+strideq*0], 1 + REPX {pmulhuw x, m4}, m0, m1, m2, m3 + jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/cdef16_avx512.asm b/third_party/dav1d/src/x86/cdef16_avx512.asm new file mode 100644 index 0000000000..6d625a02a0 --- /dev/null +++ b/third_party/dav1d/src/x86/cdef16_avx512.asm @@ -0,0 +1,622 @@ +; Copyright © 2022, VideoLAN and dav1d authors +; Copyright © 2022, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
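
The cdef_dir_16bpc function that closes the file above does not duplicate the direction search: it scales the high-bitdepth samples down to an 8-bit range with pmulhuw and then jumps into the 8bpc implementation's .main label. A scalar sketch of that normalization (the function name is illustrative):

/* pmulhuw computes (x * k) >> 16. The dir_shift table holds k = 0x4000 for
 * 10 bpc (so x >> 2) and k = 0x1000 for 12 bpc (so x >> 4); bdmax >> 11
 * selects between them (1023 >> 11 == 0, 4095 >> 11 == 1). Either way the
 * result lands in the 0..255 range the 8-bit direction search expects. */
static int normalize_for_dir(int px, int bdmax) {
    unsigned k = (bdmax >> 11) ? 0x1000u : 0x4000u;
    return (int)(((unsigned)px * k) >> 16);
}
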
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +cdef_perm: db 2, 18, 16, 18, 24, 19, 0, 19, 25, 20, 1, 20, 26, 21, 2, 21 + db 3, 26, 3, 26, 28, 27, 4, 27, 29, 28, -1, 28, 30, 29, -1, 29 + db 0, 34, 17, 34, 16, 35, 8, 35, 17, 36, 9, 36, 18, 37, 10, 37 + db 1, 42, 11, 42, 20, 43, 12, 43, 21, 44, -1, 44, 22, 45, -1, 45 +end_perm4: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 + db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62 +edge_mask4: dw 0xff99, 0xff88, 0xff11, 0xff00 ; 0100, 0101, 0110, 0111 + dw 0x99ff, 0x88ff, 0x11ff, 0x00ff ; 1000, 1001, 1010, 1011 + dw 0x9999, 0x8888, 0x1111, 0x0000 ; 1100, 1101, 1110, 1111 +pri_taps4: dw 64, 32, 48, 48 ; left-shifted by 4 +cdef_dirs4: dw 8, 16, 8, 15, -7,-14, 1, -6 + dw 1, 2, 1, 10, 9, 18, 8, 17 + dw 8, 16, 8, 15, -7,-14, 1, -6 +deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 +cdef_dirs8: db 32, 64, 32, 62,-30,-60, 2,-28 + db 2, 4, 2, 36, 34, 68, 32, 66 + db 32, 64, 32, 62,-30,-60, 2,-28 +pri_taps8: dw 4, 4, 2, 2, 3, 3, 3, 3 +sec_taps4: dw 32, 16 +pw_m16384: times 2 dw -16384 +pw_2048: times 2 dw 2048 +pd_268435568: dd 268435568 ; (1 << 28) + (7 << 4) +edge_mask8: dw 0x2121, 0x2020, 0x0101 + +SECTION .text + +%macro CONSTRAIN 7 ; dst, p, px, zero, tresh, shift, tmp + psubw %1, %2, %3 + pabsw %1, %1 + vpcmpgtw k1, %3, %2 + vpsrlvw %7, %1, %6 + psubusw %7, %5, %7 + pminsw %1, %7 + vpsubw %1{k1}, %4, %1 +%endmacro + +; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25 +; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35 +; L0 L1 00 01 02 03 04 05 b0 b1 b2 b3 b4 b5 b6 b7 +; L2 L3 10 11 12 13 14 15 B0 B1 B2 B3 B4 B5 B6 B7 + +INIT_ZMM avx512icl +cglobal cdef_filter_4x4_16bpc, 5, 7, 16, dst, stride, left, top, bot, \ + pri, sec, dir, damping, edge +%define base r6-cdef_dirs4 + lea r6, [cdef_dirs4] + movu xm3, [dstq+strideq*0] + vinserti32x4 ym3, [dstq+strideq*1], 1 + mova xm2, [leftq] + lea r2, [dstq+strideq*2] + vinserti32x4 m3, [r2+strideq*0], 2 + mova m5, [base+cdef_perm] + vinserti32x4 m3, [r2+strideq*1], 3 + vpermt2d m2, m5, m3 + vinserti32x4 m1, m2, [topq+strideq*0-4], 0 + vinserti32x4 m1, [topq+strideq*1-4], 1 + mov r3d, edgem + movifnidn prid, prim + punpcklwd m3, m3 ; px + psrlw m5, 8 + vpbroadcastd m0, [base+pd_268435568] + pxor m12, m12 + cmp r3d, 0x0f + jne .mask_edges + vinserti32x4 m2, [botq+strideq*0-4], 2 + vinserti32x4 m2, [botq+strideq*1-4], 3 +.main: + test prid, prid + jz .sec_only + lzcnt r4d, prid + rorx r3d, prid, 2 + vpbroadcastw m13, prim + cmp dword r10m, 0xfff ; if (bpc == 12) + cmove prid, r3d ; pri >>= 2 + mov r3d, dampingm + and prid, 4 + sub r3d, 31 + vpbroadcastd m15, [base+pri_taps4+priq] + xor prid, prid + add r4d, r3d + cmovns prid, r4d ; pri_shift + mov r4d, dirm + vpbroadcastw m14, prid + mov r5d, secm + vpbroadcastd m9, [base+cdef_dirs4+(r4+2)*4] + call .constrain + test r5d, r5d + jz .end_no_clip + lzcnt r5d, r5d + vpbroadcastw m13, secm + add r3d, r5d + pminuw m6, m3, m8 + pmaxsw m7, m3, m8 + pminuw m6, m9 + pmaxsw m7, m9 + call .constrain_sec + pminuw m6, m8 + pmaxsw m7, m8 + pminuw m6, m9 + pmaxsw m7, m9 + vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4] + call .constrain + pminuw m6, m8 + pmaxsw m7, m8 + pminuw m6, m9 + pmaxsw m7, m9 + psrldq m8, m6, 2 + vpshldd m3, m0, 8 + psrldq m9, m7, 2 + paddd m0, m3 + pminuw m6, m8 + psrldq m0, 1 + pmaxsw m7, m9 + pmaxsw m0, m6 + pminsw m0, m7 + vpmovdw ym0, m0 + jmp .end +.sec_only: + tzcnt r5d, secm + mov r3d, dampingm + vpbroadcastw m13, secm + mov r4d, dirm + sub r3d, r5d ; 
sec_shift + call .constrain_sec + vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4] + call .constrain +.end_no_clip: + mova ym1, [base+end_perm4] + vpshldd m3, m0, 8 ; (px << 8) + ((sum > -8) << 4) + paddd m0, m3 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) + vpermb m0, m1, m0 +.end: + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti32x4 xm0, ym0, 1 + movq [r2+strideq*0], xm0 + movhps [r2+strideq*1], xm0 + RET +.mask_edges: + vpbroadcastd m6, [base+pw_m16384] + test r3b, 0x08 + jz .mask_edges_no_bottom ; avoid buffer overread + vinserti32x4 m2, [botq+strideq*0-4], 2 + vinserti32x4 m2, [botq+strideq*1-4], 3 + kmovw k1, [base+edge_mask4-8+r3*2] + jmp .mask_edges_main +.mask_edges_no_bottom: + kmovw k1, [base+edge_mask4+8+r3*2] +.mask_edges_main: + or r3d, 0x04 + vmovdqa32 m1{k1}, m6 ; edge pixels = -16384 + kmovw k1, [base+edge_mask4-8+r3*2] + vmovdqa32 m2{k1}, m6 + jmp .main +.constrain_sec: + vpbroadcastd m9, [base+cdef_dirs4+(r4+4)*4] + vpbroadcastw m14, r3d + vpbroadcastd m15, [base+sec_taps4] +.constrain: + paddw m8, m5, m9 + vpermi2w m8, m1, m2 ; k0p0 k1p0 + psubw m9, m5, m9 + vpermi2w m9, m1, m2 ; k0p1 k1p1 + CONSTRAIN m10, m8, m3, m12, m13, m14, m11 + vpdpwssd m0, m10, m15 + CONSTRAIN m10, m9, m3, m12, m13, m14, m11 + vpdpwssd m0, m10, m15 + ret + +; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65 +; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75 +; L0 L1 00 01 02 03 04 05 L8 L9 40 41 42 43 44 45 b0 b1 b2 b3 b4 b5 b6 b7 +; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55 B0 B1 B2 B3 B4 B5 B6 B7 + +cglobal cdef_filter_4x8_16bpc, 5, 7, 22, dst, stride, left, top, bot, \ + pri, sec, dir, damping, edge + lea r6, [cdef_dirs4] + movu xm18, [dstq+strideq*0] + vinserti128 ym18, [dstq+strideq*1], 1 + mova xm1, [leftq+16*0] + mova xm2, [leftq+16*1] + lea r2, [strideq*3] + vinserti32x4 m18, [dstq+strideq*2], 2 + mova m5, [base+cdef_perm] + vinserti32x4 m18, [dstq+r2 ], 3 + vpermt2d m1, m5, m18 + vinserti32x4 m0, m1, [topq+strideq*0-4], 0 + vinserti32x4 m0, [topq+strideq*1-4], 1 + lea r3, [dstq+strideq*4] + movu xm19, [r3+strideq*0] + vinserti128 ym19, [r3+strideq*1], 1 + vinserti32x4 m19, [r3+strideq*2], 2 + vinserti32x4 m19, [r3+r2 ], 3 + mov r3d, edgem + movifnidn prid, prim + vpermt2d m2, m5, m19 + vpbroadcastd m16, [base+pd_268435568] + pxor m12, m12 + punpcklwd m18, m18 ; px (top) + psrlw m5, 8 + punpcklwd m19, m19 ; px (bottom) + mova m17, m16 + vshufi32x4 m1, m2, q3210 + cmp r3d, 0x0f + jne .mask_edges + vinserti32x4 m2, [botq+strideq*0-4], 2 + vinserti32x4 m2, [botq+strideq*1-4], 3 +.main: + test prid, prid + jz .sec_only + lzcnt r4d, prid + rorx r3d, prid, 2 + vpbroadcastw m13, prim + cmp dword r10m, 0xfff ; if (bpc == 12) + cmove prid, r3d ; pri >>= 2 + mov r3d, dampingm + and prid, 4 + sub r3d, 31 + vpbroadcastd m15, [base+pri_taps4+priq] + xor prid, prid + add r4d, r3d + cmovns prid, r4d ; pri_shift + mov r4d, dirm + vpbroadcastw m14, prid + mov r5d, secm + vpbroadcastd m9, [base+cdef_dirs4+(r4+2)*4] + call .constrain + test r5d, r5d + jz .end_no_clip + lzcnt r5d, r5d + vpbroadcastw m13, secm + add r3d, r5d + pminuw m3, m18, m6 + pmaxsw m4, m18, m6 + pminuw m20, m19, m7 + pmaxsw m21, m19, m7 + pminuw m3, m8 + pmaxsw m4, m8 + pminuw m20, m9 + pmaxsw m21, m9 + call .constrain_sec + pminuw m3, m6 + pmaxsw m4, m6 + pminuw m20, m7 + pmaxsw m21, m7 + pminuw m3, m8 + pmaxsw m4, m8 + pminuw m20, m9 + pmaxsw m21, m9 + vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4] + call .constrain + pminuw m3, m6 + pmaxsw m4, m6 + mov r3, 
0xcccccccccccccccc + pminuw m20, m7 + pmaxsw m21, m7 + kmovq k1, r3 + pminuw m3, m8 + pmaxsw m4, m8 + pminuw m20, m9 + pmaxsw m21, m9 + vbroadcasti32x4 m0, [base+deint_shuf] + vpshldd m6, m20, m3, 16 + vmovdqu8 m3{k1}, m20 + vpshldd m18, m16, 8 + vpshldd m7, m21, m4, 16 + vmovdqu8 m4{k1}, m21 + vpshldd m19, m17, 8 + pminuw m3, m6 + paddd m16, m18 + pmaxsw m4, m7 + paddd m17, m19 + psrldq m16, 1 + palignr m16{k1}, m17, m17, 15 + lea r6, [dstq+strideq*4] + pmaxsw m16, m3 + pminsw m16, m4 + pshufb m16, m0 + movq [dstq+strideq*0], xm16 + movhps [r6 +strideq*0], xm16 + vextracti128 xm17, ym16, 1 + movq [dstq+strideq*1], xm17 + movhps [r6 +strideq*1], xm17 + vextracti32x4 xm17, m16, 2 + movq [dstq+strideq*2], xm17 + movhps [r6 +strideq*2], xm17 + vextracti32x4 xm16, m16, 3 + movq [dstq+r2 ], xm16 + movhps [r6 +r2 ], xm16 + RET +.sec_only: + mov r4d, dirm + tzcnt r5d, secm + mov r3d, dampingm + vpbroadcastw m13, secm + sub r3d, r5d ; sec_shift + call .constrain_sec + vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4] + call .constrain +.end_no_clip: + mova ym20, [base+end_perm4] + vpshldd m18, m16, 8 ; (px << 8) + ((sum > -8) << 4) + vpshldd m19, m17, 8 + paddd m16, m18 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) + paddd m17, m19 + vpermb m16, m20, m16 + vpermb m17, m20, m17 + movq [dstq+strideq*0], xm16 + movhps [dstq+strideq*1], xm16 + vextracti128 xm16, ym16, 1 + movq [dstq+strideq*2], xm16 + movhps [dstq+r2 ], xm16 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm17 + movhps [dstq+strideq*1], xm17 + vextracti128 xm17, ym17, 1 + movq [dstq+strideq*2], xm17 + movhps [dstq+r2 ], xm17 + RET +.mask_edges: + vpbroadcastd m6, [base+pw_m16384] + test r3b, 0x08 + jz .mask_edges_no_bottom ; avoid buffer overread + vinserti32x4 m2, [botq+strideq*0-4], 2 + vinserti32x4 m2, [botq+strideq*1-4], 3 + kmovw k1, [base+edge_mask4-8+r3*2] + jmp .mask_edges_main +.mask_edges_no_bottom: + kmovw k1, [base+edge_mask4+8+r3*2] +.mask_edges_main: + mov r4d, r3d + or r3d, 0x0c + vmovdqa32 m0{k1}, m6 ; edge pixels = -16384 + kmovw k1, [base+edge_mask4-8+r3*2] + or r4d, 0x04 + vmovdqa32 m1{k1}, m6 + kmovw k1, [base+edge_mask4-8+r4*2] + vmovdqa32 m2{k1}, m6 + jmp .main +.constrain_sec: + vpbroadcastd m9, [base+cdef_dirs4+(r4+4)*4] + vpbroadcastw m14, r3d + vpbroadcastd m15, [base+sec_taps4] +.constrain: + paddw m7, m5, m9 + mova m6, m0 + vpermt2w m6, m7, m1 ; k0p0 k1p0 (top) + psubw m9, m5, m9 + mova m8, m0 + vpermi2w m7, m1, m2 ; k0p0 k1p0 (bottom) + CONSTRAIN m10, m6, m18, m12, m13, m14, m11 + vpermt2w m8, m9, m1 ; k0p1 k1p1 (top) + vpdpwssd m16, m10, m15 + CONSTRAIN m10, m7, m19, m12, m13, m14, m11 + vpermi2w m9, m1, m2 ; k0p1 k1p1 (bottom) + vpdpwssd m17, m10, m15 + CONSTRAIN m10, m8, m18, m12, m13, m14, m11 + vpdpwssd m16, m10, m15 + CONSTRAIN m10, m9, m19, m12, m13, m14, m11 + vpdpwssd m17, m10, m15 + ret + +cglobal cdef_filter_8x8_16bpc, 5, 7, 22, 64*6, dst, stride, left, top, bot, \ + pri, sec, dir, damping, edge +%define base r6-cdef_dirs8 + lea r6, [cdef_dirs8] + movu ym17, [dstq+strideq*0] + vinserti32x8 m17, [dstq+strideq*1], 1 + movq xm4, [leftq+8*0] + movq xm5, [leftq+8*1] + psrld m2, [base+cdef_perm], 16 + movq xm6, [leftq+8*2] + movq xm7, [leftq+8*3] + lea r2, [strideq*3] + movu ym16, [topq+strideq*0-4] + vinserti32x8 m16, [topq+strideq*1-4], 1 + lea r3, [dstq+strideq*4] + movu ym18, [dstq+strideq*2] + vinserti32x8 m18, [dstq+r2 ], 1 + movu ym19, [r3+strideq*0] + vinserti32x8 m19, [r3+strideq*1], 1 + movu ym20, [r3+strideq*2] + vinserti32x8 m20, [r3+r2 ], 1 + vshufi32x4 m0, m17, m18, q2020 ; px (top) + mov 
r3d, edgem + vshufi32x4 m1, m19, m20, q2020 ; px (bottom) + movifnidn prid, prim + vpermt2d m17, m2, m4 + vpermt2d m18, m2, m5 + pxor m12, m12 + vpermt2d m19, m2, m6 + vpermt2d m20, m2, m7 + cmp r3d, 0x0f + jne .mask_edges + movu ym21, [botq+strideq*0-4] + vinserti32x8 m21, [botq+strideq*1-4], 1 +.main: + mova [rsp+64*0], m16 ; top + mova [rsp+64*1], m17 ; 0 1 + mova [rsp+64*2], m18 ; 2 3 + mova [rsp+64*3], m19 ; 4 5 + mova [rsp+64*4], m20 ; 6 7 + mova [rsp+64*5], m21 ; bottom + test prid, prid + jz .sec_only + lzcnt r4d, prid + rorx r3d, prid, 2 + vpbroadcastw m13, prim + cmp dword r10m, 0xfff ; if (bpc == 12) + cmove prid, r3d ; pri >>= 2 + mov r3d, dampingm + and prid, 4 + sub r3d, 31 + add r4d, r3d ; pri_shift + vpbroadcastw m14, r4d + mov r4d, dirm + vpbroadcastd m2, [base+pri_taps8+priq*2+0] + vpbroadcastd m3, [base+pri_taps8+priq*2+4] + movsx r5, byte [base+cdef_dirs8+(r4+2)*2+0] ; k0off1 + pmaxsw m14, m12 + call .constrain + mov r5d, secm + pmullw m16, m8, m2 + pmullw m17, m9, m2 + test r5d, r5d + jnz .pri_sec + movsx r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1 + call .constrain + pmullw m8, m3 + pmullw m9, m3 + jmp .end_no_clip +.pri_sec: + lzcnt r5d, r5d + add r3d, r5d ; sec_shift + movsx r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1 + pminuw m18, m0, m4 + pmaxsw m19, m0, m4 + pminuw m20, m1, m5 + pmaxsw m21, m1, m5 + call .min_max_constrain2 + movsx r5, byte [base+cdef_dirs8+(r4+0)*2+0] ; k0off2 + pmullw m8, m3 + pmullw m9, m3 + vpbroadcastw m13, secm + vpbroadcastw m14, r3d + paddw m16, m8 + paddw m17, m9 + call .min_max_constrain + movsx r5, byte [base+cdef_dirs8+(r4+4)*2+0] ; k0off3 + mova m2, m8 + mova m3, m9 + call .min_max_constrain + movsx r5, byte [base+cdef_dirs8+(r4+0)*2+1] ; k1off2 + paddw m2, m8 + paddw m3, m9 + call .min_max_constrain + movsx r5, byte [base+cdef_dirs8+(r4+4)*2+1] ; k1off3 + paddw m2, m2 + paddw m3, m3 + paddw m16, m8 + paddw m17, m9 + call .min_max_constrain + vpbroadcastd m10, [base+pw_2048] + paddw m16, m2 + paddw m17, m3 + paddw m16, m8 + paddw m17, m9 + psraw m8, m16, 15 + psraw m9, m17, 15 + paddw m16, m8 + paddw m17, m9 + pmulhrsw m16, m10 + pmulhrsw m17, m10 + pminuw m18, m4 + pmaxsw m19, m4 + pminuw m20, m5 + pmaxsw m21, m5 + pminuw m18, m6 + pmaxsw m19, m6 + pminuw m20, m7 + pmaxsw m21, m7 + paddw m16, m0 + paddw m17, m1 + pmaxsw m16, m18 + pmaxsw m17, m20 + pminsw m16, m19 + pminsw m17, m21 + jmp .end +.sec_only: + tzcnt r5d, secm + mov r4d, dirm + mov r3d, dampingm + vpbroadcastw m13, secm + sub r3d, r5d + movsx r5, byte [base+cdef_dirs8+(r4+0)*2+0] + vpbroadcastw m14, r3d + call .constrain + movsx r5, byte [base+cdef_dirs8+(r4+4)*2+0] + mova m16, m8 + mova m17, m9 + call .constrain + movsx r5, byte [base+cdef_dirs8+(r4+0)*2+1] + paddw m16, m8 + paddw m17, m9 + call .constrain + movsx r5, byte [base+cdef_dirs8+(r4+4)*2+1] + paddw m16, m16 + paddw m17, m17 + paddw m16, m8 + paddw m17, m9 + call .constrain +.end_no_clip: + vpbroadcastd m10, [base+pw_2048] + paddw m16, m8 + paddw m17, m9 + psraw m8, m16, 15 + psraw m9, m17, 15 + paddw m16, m8 + paddw m17, m9 + pmulhrsw m16, m10 + pmulhrsw m17, m10 + paddw m16, m0 + paddw m17, m1 +.end: + mova [dstq+strideq*0], xm16 + vextracti128 [dstq+strideq*1], ym16, 1 + vextracti32x4 [dstq+strideq*2], m16, 2 + vextracti32x4 [dstq+r2 ], m16, 3 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm17 + vextracti128 [dstq+strideq*1], ym17, 1 + vextracti32x4 [dstq+strideq*2], m17, 2 + vextracti32x4 [dstq+r2 ], m17, 3 + RET +.mask_edges: + vpbroadcastd m2, [base+pw_m16384] + test r3b, 0x08 + jz 
.mask_edges_no_bottom ; avoid buffer overread + movu ym21, [botq+strideq*0-4] + vinserti32x8 m21, [botq+strideq*1-4], 1 + jmp .mask_edges_top +.mask_edges_no_bottom: + mova m21, m2 +.mask_edges_top: + test r3b, 0x04 + jnz .mask_edges_main + mova m16, m2 +.mask_edges_main: + and r3d, 0x03 + cmp r3d, 0x03 + je .main + kmovw k1, [base+edge_mask8+r3*2] + vmovdqa32 m16{k1}, m2 ; edge pixels = -16384 + vmovdqa32 m17{k1}, m2 + vmovdqa32 m18{k1}, m2 + vmovdqa32 m19{k1}, m2 + vmovdqa32 m20{k1}, m2 + vmovdqa32 m21{k1}, m2 + jmp .main +ALIGN function_align +.min_max_constrain: + pminuw m18, m4 + pmaxsw m19, m4 + pminuw m20, m5 + pmaxsw m21, m5 +.min_max_constrain2: + pminuw m18, m6 + pmaxsw m19, m6 + pminuw m20, m7 + pmaxsw m21, m7 +.constrain: + %define tmp rsp+gprsize+68 + movu m4, [tmp+r5+64*0] + vshufi32x4 m4, [tmp+r5+64*1], q2020 ; k0p0 (top) + movu m5, [tmp+r5+64*2] + vshufi32x4 m5, [tmp+r5+64*3], q2020 ; k0p0 (bottom) + neg r5 + movu m6, [tmp+r5+64*0] + vshufi32x4 m6, [tmp+r5+64*1], q2020 ; k0p1 (top) + movu m7, [tmp+r5+64*2] + vshufi32x4 m7, [tmp+r5+64*3], q2020 ; k0p1 (bottom) + CONSTRAIN m8, m4, m0, m12, m13, m14, m15 + CONSTRAIN m9, m5, m1, m12, m13, m14, m15 + CONSTRAIN m10, m6, m0, m12, m13, m14, m15 + CONSTRAIN m11, m7, m1, m12, m13, m14, m15 + paddw m8, m10 + paddw m9, m11 + ret + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/cdef16_sse.asm b/third_party/dav1d/src/x86/cdef16_sse.asm new file mode 100644 index 0000000000..1bd67ace64 --- /dev/null +++ b/third_party/dav1d/src/x86/cdef16_sse.asm @@ -0,0 +1,1033 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; Copyright (c) 2017-2021, The rav1e contributors +; Copyright (c) 2021, Nathan Egge +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
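
As in the AVX2 and AVX-512 files above, the CDEF_FILTER macro below derives the attenuation shifts fed into the constrain logic from the frame-level damping parameter before the per-pixel loops run. Roughly what the bsr/lzcnt and cmov sequences compute, as a scalar sketch (__builtin_clz assumes GCC/Clang, and the callers have already ruled out strength == 0):

static int cdef_shift(int strength, int damping) {
    int log2s = 31 - __builtin_clz((unsigned)strength); /* bsr/lzcnt idiom */
    int shift = damping - log2s;
    return shift < 0 ? 0 : shift;                       /* cmovs clamps */
}

The secondary strength is a power of two (0, 1, 2 or 4), which is why the sec-only paths can get its log2 with a plain tzcnt.
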
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA + +%macro DUP8 1-* + %rep %0 + times 8 dw %1 + %rotate 1 + %endrep +%endmacro + +pri_taps: DUP8 4, 2, 3, 3 +dir_table: db 1 * 32 + 0, 2 * 32 + 0 + db 1 * 32 + 0, 2 * 32 - 2 + db -1 * 32 + 2, -2 * 32 + 4 + db 0 * 32 + 2, -1 * 32 + 4 + db 0 * 32 + 2, 0 * 32 + 4 + db 0 * 32 + 2, 1 * 32 + 4 + db 1 * 32 + 2, 2 * 32 + 4 + db 1 * 32 + 0, 2 * 32 + 2 + db 1 * 32 + 0, 2 * 32 + 0 + db 1 * 32 + 0, 2 * 32 - 2 + db -1 * 32 + 2, -2 * 32 + 4 + db 0 * 32 + 2, -1 * 32 + 4 + +dir_shift: times 4 dw 0x4000 + times 4 dw 0x1000 + +pw_128: times 4 dw 128 +pw_2048: times 8 dw 2048 +pw_m16384: times 8 dw -16384 + +cextern cdef_dir_8bpc_ssse3.main +cextern cdef_dir_8bpc_sse4.main +cextern shufw_6543210x + +SECTION .text + +%if ARCH_X86_32 +DECLARE_REG_TMP 5, 3 +%elif WIN64 +DECLARE_REG_TMP 8, 4 +%else +DECLARE_REG_TMP 8, 6 +%endif + +%macro CDEF_FILTER 2 ; w, h +%if ARCH_X86_64 + DEFINE_ARGS dst, stride, _, tmp, pridmp, pri, sec, dir + mova m8, [base+pw_2048] +%else + DEFINE_ARGS dst, pridmp, tmp, sec, pri, _, dir + %define m8 [base+pw_2048] + %define m9 [rsp+16*1+gprsize] + %define m10 [rsp+16*2+gprsize] +%endif + movifnidn prid, r5m + movifnidn secd, r6m + test prid, prid + jz .sec_only + movd m6, r5m +%if ARCH_X86_32 + mov [rsp+24], pridmpd +%endif + bsr pridmpd, prid + lea tmpd, [priq*4] + cmp dword r10m, 0x3ff ; if (bpc == 10) + cmove prid, tmpd ; pri <<= 2 + mov tmpd, r8m ; damping + mov dird, r7m + and prid, 16 + pshufb m6, m7 ; splat + lea dirq, [base+dir_table+dirq*2] + lea priq, [base+pri_taps+priq*2] + test secd, secd + jz .pri_only + mova [rsp], m6 + movd m6, secd + tzcnt secd, secd + sub pridmpd, tmpd + sub tmpd, secd + pshufb m6, m7 + xor secd, secd + neg pridmpd + cmovs pridmpd, secd +%if ARCH_X86_32 + mov [pri_shift+4], secd + mov [sec_shift+4], secd +%endif + mov [pri_shift+0], pridmpq + mov [sec_shift+0], tmpq + lea tmpq, [px] +%if WIN64 + movaps r4m, m9 + movaps r6m, m10 +%elif ARCH_X86_32 + mov pridmpd, [rsp+24] +%endif +%rep %1*%2/8 + call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec +%endrep +%if WIN64 + movaps m9, r4m + movaps m10, r6m +%endif + jmp .end +.pri_only: + sub tmpd, pridmpd + cmovs tmpd, secd +%if ARCH_X86_32 + mov pridmpd, [rsp+24] + mov [pri_shift+4], secd +%endif + mov [pri_shift+0], tmpq + lea tmpq, [px] +%rep %1*%2/8 + call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri +%endrep +.end: + RET +.sec_only: + mov tmpd, r8m ; damping + movd m6, r6m + tzcnt secd, secd + mov dird, r7m + pshufb m6, m7 + sub tmpd, secd + lea dirq, [base+dir_table+dirq*2] +%if ARCH_X86_32 + mov [sec_shift+4], prid +%endif + mov [sec_shift+0], tmpq + lea tmpq, [px] +%rep %1*%2/8 + call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec +%endrep + jmp .end +%if %1 == %2 + %if ARCH_X86_64 + DEFINE_ARGS dst, stride, _, tmp, off, pri, _, dir + %else + DEFINE_ARGS dst, stride, tmp, off, pri, _, dir + %endif +ALIGN function_align +.pri: + movsx offq, byte [dirq+4] ; off_k0 +%if %1 == 4 + movq m1, [dstq+strideq*0] + movhps m1, [dstq+strideq*1] + movq m2, [tmpq+offq+32*0] ; k0p0 + movhps m2, [tmpq+offq+32*1] + neg offq + movq m3, [tmpq+offq+32*0] ; k0p1 + movhps m3, [tmpq+offq+32*1] +%else + mova m1, [dstq] + movu m2, [tmpq+offq] + neg offq + movu m3, [tmpq+offq] +%endif + movsx offq, byte [dirq+5] ; off_k1 + psubw m2, m1 ; diff_k0p0 + psubw m3, m1 ; diff_k0p1 + pabsw m4, m2 ; adiff_k0p0 + psrlw m5, m4, [pri_shift+gprsize] + psubusw m0, m6, m5 + pabsw m5, m3 ; adiff_k0p1 + pminsw m0, 
m4 + psrlw m4, m5, [pri_shift+gprsize] + psignw m0, m2 ; constrain(diff_k0p0) + psubusw m2, m6, m4 + pminsw m2, m5 +%if %1 == 4 + movq m4, [tmpq+offq+32*0] ; k1p0 + movhps m4, [tmpq+offq+32*1] + neg offq + movq m5, [tmpq+offq+32*0] ; k1p1 + movhps m5, [tmpq+offq+32*1] +%else + movu m4, [tmpq+offq] + neg offq + movu m5, [tmpq+offq] +%endif + psubw m4, m1 ; diff_k1p0 + psubw m5, m1 ; diff_k1p1 + psignw m2, m3 ; constrain(diff_k0p1) + pabsw m3, m4 ; adiff_k1p0 + paddw m0, m2 ; constrain(diff_k0) + psrlw m2, m3, [pri_shift+gprsize] + psubusw m7, m6, m2 + pabsw m2, m5 ; adiff_k1p1 + pminsw m7, m3 + psrlw m3, m2, [pri_shift+gprsize] + psignw m7, m4 ; constrain(diff_k1p0) + psubusw m4, m6, m3 + pminsw m4, m2 + psignw m4, m5 ; constrain(diff_k1p1) + paddw m7, m4 ; constrain(diff_k1) + pmullw m0, [priq+16*0] ; pri_tap_k0 + pmullw m7, [priq+16*1] ; pri_tap_k1 + paddw m0, m7 ; sum + psraw m2, m0, 15 + paddw m0, m2 + pmulhrsw m0, m8 + paddw m0, m1 +%if %1 == 4 + add tmpq, 32*2 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] +%else + add tmpq, 32 + mova [dstq], m0 + add dstq, strideq +%endif + ret +ALIGN function_align +.sec: + movsx offq, byte [dirq+8] ; off1_k0 +%if %1 == 4 + movq m1, [dstq+strideq*0] + movhps m1, [dstq+strideq*1] + movq m2, [tmpq+offq+32*0] ; k0s0 + movhps m2, [tmpq+offq+32*1] + neg offq + movq m3, [tmpq+offq+32*0] ; k0s1 + movhps m3, [tmpq+offq+32*1] +%else + mova m1, [dstq] + movu m2, [tmpq+offq] + neg offq + movu m3, [tmpq+offq] +%endif + movsx offq, byte [dirq+0] ; off2_k0 + psubw m2, m1 ; diff_k0s0 + psubw m3, m1 ; diff_k0s1 + pabsw m4, m2 ; adiff_k0s0 + psrlw m5, m4, [sec_shift+gprsize] + psubusw m0, m6, m5 + pabsw m5, m3 ; adiff_k0s1 + pminsw m0, m4 + psrlw m4, m5, [sec_shift+gprsize] + psignw m0, m2 ; constrain(diff_k0s0) + psubusw m2, m6, m4 + pminsw m2, m5 +%if %1 == 4 + movq m4, [tmpq+offq+32*0] ; k0s2 + movhps m4, [tmpq+offq+32*1] + neg offq + movq m5, [tmpq+offq+32*0] ; k0s3 + movhps m5, [tmpq+offq+32*1] +%else + movu m4, [tmpq+offq] + neg offq + movu m5, [tmpq+offq] +%endif + movsx offq, byte [dirq+9] ; off1_k1 + psubw m4, m1 ; diff_k0s2 + psubw m5, m1 ; diff_k0s3 + psignw m2, m3 ; constrain(diff_k0s1) + pabsw m3, m4 ; adiff_k0s2 + paddw m0, m2 + psrlw m2, m3, [sec_shift+gprsize] + psubusw m7, m6, m2 + pabsw m2, m5 ; adiff_k0s3 + pminsw m7, m3 + psrlw m3, m2, [sec_shift+gprsize] + psignw m7, m4 ; constrain(diff_k0s2) + psubusw m4, m6, m3 + pminsw m4, m2 +%if %1 == 4 + movq m2, [tmpq+offq+32*0] ; k1s0 + movhps m2, [tmpq+offq+32*1] + neg offq + movq m3, [tmpq+offq+32*0] ; k1s1 + movhps m3, [tmpq+offq+32*1] +%else + movu m2, [tmpq+offq] + neg offq + movu m3, [tmpq+offq] +%endif + movsx offq, byte [dirq+1] ; off2_k1 + paddw m0, m7 + psignw m4, m5 ; constrain(diff_k0s3) + paddw m0, m4 ; constrain(diff_k0) + psubw m2, m1 ; diff_k1s0 + psubw m3, m1 ; diff_k1s1 + paddw m0, m0 ; sec_tap_k0 + pabsw m4, m2 ; adiff_k1s0 + psrlw m5, m4, [sec_shift+gprsize] + psubusw m7, m6, m5 + pabsw m5, m3 ; adiff_k1s1 + pminsw m7, m4 + psrlw m4, m5, [sec_shift+gprsize] + psignw m7, m2 ; constrain(diff_k1s0) + psubusw m2, m6, m4 + pminsw m2, m5 +%if %1 == 4 + movq m4, [tmpq+offq+32*0] ; k1s2 + movhps m4, [tmpq+offq+32*1] + neg offq + movq m5, [tmpq+offq+32*0] ; k1s3 + movhps m5, [tmpq+offq+32*1] +%else + movu m4, [tmpq+offq] + neg offq + movu m5, [tmpq+offq] +%endif + paddw m0, m7 + psubw m4, m1 ; diff_k1s2 + psubw m5, m1 ; diff_k1s3 + psignw m2, m3 ; constrain(diff_k1s1) + pabsw m3, m4 ; adiff_k1s2 + paddw m0, m2 + psrlw m2, m3, [sec_shift+gprsize] + 
psubusw m7, m6, m2 + pabsw m2, m5 ; adiff_k1s3 + pminsw m7, m3 + psrlw m3, m2, [sec_shift+gprsize] + psignw m7, m4 ; constrain(diff_k1s2) + psubusw m4, m6, m3 + pminsw m4, m2 + paddw m0, m7 + psignw m4, m5 ; constrain(diff_k1s3) + paddw m0, m4 ; sum + psraw m2, m0, 15 + paddw m0, m2 + pmulhrsw m0, m8 + paddw m0, m1 +%if %1 == 4 + add tmpq, 32*2 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] +%else + add tmpq, 32 + mova [dstq], m0 + add dstq, strideq +%endif + ret +ALIGN function_align +.pri_sec: + movsx offq, byte [dirq+8] ; off2_k0 +%if %1 == 4 + movq m1, [dstq+strideq*0] + movhps m1, [dstq+strideq*1] + movq m2, [tmpq+offq+32*0] ; k0s0 + movhps m2, [tmpq+offq+32*1] + neg offq + movq m3, [tmpq+offq+32*0] ; k0s1 + movhps m3, [tmpq+offq+32*1] +%else + mova m1, [dstq] + movu m2, [tmpq+offq] + neg offq + movu m3, [tmpq+offq] +%endif + movsx offq, byte [dirq+0] ; off3_k0 + pabsw m4, m2 +%if ARCH_X86_64 + pabsw m10, m3 + pmaxsw m9, m2, m3 + pminsw m10, m4 +%else + pabsw m7, m3 + pmaxsw m5, m2, m3 + pminsw m4, m7 + mova m9, m5 + mova m10, m4 +%endif + psubw m2, m1 ; diff_k0s0 + psubw m3, m1 ; diff_k0s1 + pabsw m4, m2 ; adiff_k0s0 + psrlw m5, m4, [sec_shift+gprsize] + psubusw m0, m6, m5 + pabsw m5, m3 ; adiff_k0s1 + pminsw m0, m4 + psrlw m4, m5, [sec_shift+gprsize] + psignw m0, m2 ; constrain(diff_k0s0) + psubusw m2, m6, m4 + pminsw m2, m5 +%if %1 == 4 + movq m4, [tmpq+offq+32*0] ; k0s2 + movhps m4, [tmpq+offq+32*1] + neg offq + movq m5, [tmpq+offq+32*0] ; k0s3 + movhps m5, [tmpq+offq+32*1] +%else + movu m4, [tmpq+offq] + neg offq + movu m5, [tmpq+offq] +%endif + movsx offq, byte [dirq+9] ; off2_k1 + pabsw m7, m4 + psignw m2, m3 + pabsw m3, m5 ; constrain(diff_k0s1) +%if ARCH_X86_64 + pmaxsw m9, m4 + pminsw m10, m7 + pmaxsw m9, m5 + pminsw m10, m3 +%else + pminsw m7, m10 + pminsw m7, m3 + pmaxsw m3, m9, m4 + pmaxsw m3, m5 + mova m10, m7 + mova m9, m3 +%endif + psubw m4, m1 ; diff_k0s2 + psubw m5, m1 ; diff_k0s3 + paddw m0, m2 + pabsw m3, m4 ; adiff_k0s2 + psrlw m2, m3, [sec_shift+gprsize] + psubusw m7, m6, m2 + pabsw m2, m5 ; adiff_k0s3 + pminsw m7, m3 + psrlw m3, m2, [sec_shift+gprsize] + psignw m7, m4 ; constrain(diff_k0s2) + psubusw m4, m6, m3 + pminsw m4, m2 +%if %1 == 4 + movq m2, [tmpq+offq+32*0] ; k1s0 + movhps m2, [tmpq+offq+32*1] + neg offq + movq m3, [tmpq+offq+32*0] ; k1s1 + movhps m3, [tmpq+offq+32*1] +%else + movu m2, [tmpq+offq] + neg offq + movu m3, [tmpq+offq] +%endif + movsx offq, byte [dirq+1] ; off3_k1 + paddw m0, m7 + pabsw m7, m2 + psignw m4, m5 ; constrain(diff_k0s3) + pabsw m5, m3 +%if ARCH_X86_64 + pmaxsw m9, m2 + pminsw m10, m7 + pmaxsw m9, m3 + pminsw m10, m5 +%else + pminsw m7, m10 + pminsw m7, m5 + pmaxsw m5, m9, m2 + pmaxsw m5, m3 + mova m10, m7 + mova m9, m5 +%endif + paddw m0, m4 ; constrain(diff_k0) + psubw m2, m1 ; diff_k1s0 + psubw m3, m1 ; diff_k1s1 + paddw m0, m0 ; sec_tap_k0 + pabsw m4, m2 ; adiff_k1s0 + psrlw m5, m4, [sec_shift+gprsize] + psubusw m7, m6, m5 + pabsw m5, m3 ; adiff_k1s1 + pminsw m7, m4 + psrlw m4, m5, [sec_shift+gprsize] + psignw m7, m2 ; constrain(diff_k1s0) + psubusw m2, m6, m4 + pminsw m2, m5 +%if %1 == 4 + movq m4, [tmpq+offq+32*0] ; k1s2 + movhps m4, [tmpq+offq+32*1] + neg offq + movq m5, [tmpq+offq+32*0] ; k1s3 + movhps m5, [tmpq+offq+32*1] +%else + movu m4, [tmpq+offq] + neg offq + movu m5, [tmpq+offq] +%endif + movsx offq, byte [dirq+4] ; off1_k0 + paddw m0, m7 + pabsw m7, m4 + psignw m2, m3 ; constrain(diff_k1s1) + pabsw m3, m5 +%if ARCH_X86_64 + pmaxsw m9, m4 + pminsw m10, m7 + pmaxsw m9, m5 + 
pminsw m10, m3 +%else + pminsw m7, m10 + pminsw m7, m3 + pmaxsw m3, m9, m4 + pmaxsw m3, m5 + mova m10, m7 + mova m9, m3 +%endif + psubw m4, m1 ; diff_k1s2 + psubw m5, m1 ; diff_k1s3 + pabsw m3, m4 ; adiff_k1s2 + paddw m0, m2 + psrlw m2, m3, [sec_shift+gprsize] + psubusw m7, m6, m2 + pabsw m2, m5 ; adiff_k1s3 + pminsw m7, m3 + psrlw m3, m2, [sec_shift+gprsize] + psignw m7, m4 ; constrain(diff_k1s2) + psubusw m4, m6, m3 + pminsw m4, m2 + paddw m0, m7 +%if %1 == 4 + movq m2, [tmpq+offq+32*0] ; k0p0 + movhps m2, [tmpq+offq+32*1] + neg offq + movq m3, [tmpq+offq+32*0] ; k0p1 + movhps m3, [tmpq+offq+32*1] +%else + movu m2, [tmpq+offq] + neg offq + movu m3, [tmpq+offq] +%endif + movsx offq, byte [dirq+5] ; off1_k1 + pabsw m7, m2 + psignw m4, m5 ; constrain(diff_k1s3) + pabsw m5, m3 +%if ARCH_X86_64 + pmaxsw m9, m2 + pminsw m10, m7 + pmaxsw m9, m3 + pminsw m10, m5 +%else + pminsw m7, m10 + pminsw m7, m5 + pmaxsw m5, m9, m2 + pmaxsw m5, m3 + mova m10, m7 + mova m9, m5 +%endif + psubw m2, m1 ; diff_k0p0 + psubw m3, m1 ; diff_k0p1 + paddw m0, m4 + pabsw m4, m2 ; adiff_k0p0 + psrlw m5, m4, [pri_shift+gprsize] + psubusw m7, [rsp+gprsize], m5 + pabsw m5, m3 ; adiff_k0p1 + pminsw m7, m4 + psrlw m4, m5, [pri_shift+gprsize] + psignw m7, m2 ; constrain(diff_k0p0) + psubusw m2, [rsp+gprsize], m4 + pminsw m2, m5 +%if %1 == 4 + movq m4, [tmpq+offq+32*0] ; k1p0 + movhps m4, [tmpq+offq+32*1] + neg offq + movq m5, [tmpq+offq+32*0] ; k1p1 + movhps m5, [tmpq+offq+32*1] +%else + movu m4, [tmpq+offq] + neg offq + movu m5, [tmpq+offq] +%endif + psignw m2, m3 ; constrain(diff_k0p1) + pabsw m3, m4 + paddw m7, m2 ; constrain(diff_k0) + pabsw m2, m5 +%if ARCH_X86_64 + pmaxsw m9, m4 + pminsw m10, m3 + pmaxsw m9, m5 + pminsw m10, m2 +%else + pminsw m3, m10 + pminsw m3, m2 + pmaxsw m2, m9, m4 + pmaxsw m2, m5 + mova m10, m3 + mova m9, m2 +%endif + psubw m4, m1 ; diff_k1p0 + psubw m5, m1 ; diff_k1p1 + pabsw m3, m4 ; adiff_k1p0 + pmullw m7, [priq+16*0] ; pri_tap_k0 + paddw m0, m7 + psrlw m2, m3, [pri_shift+gprsize] + psubusw m7, [rsp+16*0+gprsize], m2 + pabsw m2, m5 ; adiff_k1p1 + pminsw m7, m3 + psrlw m3, m2, [pri_shift+gprsize] + psignw m7, m4 ; constrain(diff_k1p0) + psubusw m4, [rsp+16*0+gprsize], m3 + pminsw m4, m2 + psignw m4, m5 ; constrain(diff_k1p1) + paddw m7, m4 ; constrain(diff_k1) + pmullw m7, [priq+16*1] ; pri_tap_k1 + paddw m0, m7 ; sum + psraw m2, m0, 15 + paddw m0, m2 + pmulhrsw m0, m8 + paddw m0, m1 +%if ARCH_X86_64 + pmaxsw m9, m1 + pminsw m0, m9 +%else + pmaxsw m2, m9, m1 + pminsw m0, m2 +%endif + pminsw m1, m10 + pmaxsw m0, m1 +%if %1 == 4 + add tmpq, 32*2 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] +%else + add tmpq, 32 + mova [dstq], m0 + add dstq, strideq +%endif + ret +%endif +%endmacro + +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal cdef_filter_4x4_16bpc, 5, 9, 9, 32*10, dst, stride, left, top, bot, \ + pri, sec, edge + %define px rsp+32*4 +%else +cglobal cdef_filter_4x4_16bpc, 2, 7, 8, -32*11, dst, stride, edge, top, left + %define botq topq + %define px rsp+32*5 +%endif + %define base t0-dir_table + %define pri_shift px-16*6 + %define sec_shift px-16*5 + mov edged, r9m + LEA t0, dir_table + movu m0, [dstq+strideq*0] + movu m1, [dstq+strideq*1] + lea t1, [dstq+strideq*2] + movu m2, [t1 +strideq*0] + movu m3, [t1 +strideq*1] + movddup m7, [base+pw_m16384] + mova [px+32*0+0], m0 + mova [px+32*1+0], m1 + mova [px+32*2+0], m2 + mova [px+32*3+0], m3 + test edgeb, 4 ; HAVE_TOP + jz .no_top + movifnidn topq, topmp + movu m0, [topq+strideq*0] + movu m1, [topq+strideq*1] 
+ mova [px-32*2+0], m0 + mova [px-32*1+0], m1 + test edgeb, 1 ; HAVE_LEFT + jz .top_no_left + movd m0, [topq+strideq*0-4] + movd m1, [topq+strideq*1-4] + movd [px-32*2-4], m0 + movd [px-32*1-4], m1 + jmp .top_done +.no_top: + mova [px-32*2+0], m7 + mova [px-32*1+0], m7 +.top_no_left: + movd [px-32*2-4], m7 + movd [px-32*1-4], m7 +.top_done: + test edgeb, 8 ; HAVE_BOTTOM + jz .no_bottom + movifnidn botq, r4mp + movu m0, [botq+strideq*0] + movu m1, [botq+strideq*1] + mova [px+32*4+0], m0 + mova [px+32*5+0], m1 + test edgeb, 1 ; HAVE_LEFT + jz .bottom_no_left + movd m0, [botq+strideq*0-4] + movd m1, [botq+strideq*1-4] + movd [px+32*4-4], m0 + movd [px+32*5-4], m1 + jmp .bottom_done +.no_bottom: + mova [px+32*4+0], m7 + mova [px+32*5+0], m7 +.bottom_no_left: + movd [px+32*4-4], m7 + movd [px+32*5-4], m7 +.bottom_done: + test edgeb, 1 ; HAVE_LEFT + jz .no_left + movifnidn leftq, r2mp + movd m0, [leftq+4*0] + movd m1, [leftq+4*1] + movd m2, [leftq+4*2] + movd m3, [leftq+4*3] + movd [px+32*0-4], m0 + movd [px+32*1-4], m1 + movd [px+32*2-4], m2 + movd [px+32*3-4], m3 + jmp .left_done +.no_left: + REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3 +.left_done: + test edgeb, 2 ; HAVE_RIGHT + jnz .padding_done + REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5 +.padding_done: + CDEF_FILTER 4, 4 + +%if ARCH_X86_64 +cglobal cdef_filter_4x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \ + pri, sec, edge +%else +cglobal cdef_filter_4x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left +%endif + mov edged, r9m + LEA t0, dir_table + movu m0, [dstq+strideq*0] + movu m1, [dstq+strideq*1] + lea t1, [dstq+strideq*2] + movu m2, [t1 +strideq*0] + movu m3, [t1 +strideq*1] + lea t1, [t1 +strideq*2] + movu m4, [t1 +strideq*0] + movu m5, [t1 +strideq*1] + lea t1, [t1 +strideq*2] + movu m6, [t1 +strideq*0] + movu m7, [t1 +strideq*1] + mova [px+32*0+0], m0 + mova [px+32*1+0], m1 + mova [px+32*2+0], m2 + mova [px+32*3+0], m3 + mova [px+32*4+0], m4 + mova [px+32*5+0], m5 + mova [px+32*6+0], m6 + mova [px+32*7+0], m7 + movddup m7, [base+pw_m16384] + test edgeb, 4 ; HAVE_TOP + jz .no_top + movifnidn topq, topmp + movu m0, [topq+strideq*0] + movu m1, [topq+strideq*1] + mova [px-32*2+0], m0 + mova [px-32*1+0], m1 + test edgeb, 1 ; HAVE_LEFT + jz .top_no_left + movd m0, [topq+strideq*0-4] + movd m1, [topq+strideq*1-4] + movd [px-32*2-4], m0 + movd [px-32*1-4], m1 + jmp .top_done +.no_top: + mova [px-32*2+0], m7 + mova [px-32*1+0], m7 +.top_no_left: + movd [px-32*2-4], m7 + movd [px-32*1-4], m7 +.top_done: + test edgeb, 8 ; HAVE_BOTTOM + jz .no_bottom + movifnidn botq, r4mp + movu m0, [botq+strideq*0] + movu m1, [botq+strideq*1] + mova [px+32*8+0], m0 + mova [px+32*9+0], m1 + test edgeb, 1 ; HAVE_LEFT + jz .bottom_no_left + movd m0, [botq+strideq*0-4] + movd m1, [botq+strideq*1-4] + movd [px+32*8-4], m0 + movd [px+32*9-4], m1 + jmp .bottom_done +.no_bottom: + mova [px+32*8+0], m7 + mova [px+32*9+0], m7 +.bottom_no_left: + movd [px+32*8-4], m7 + movd [px+32*9-4], m7 +.bottom_done: + test edgeb, 1 ; HAVE_LEFT + jz .no_left + movifnidn leftq, r2mp + movd m0, [leftq+4*0] + movd m1, [leftq+4*1] + movd m2, [leftq+4*2] + movd m3, [leftq+4*3] + movd [px+32*0-4], m0 + movd [px+32*1-4], m1 + movd [px+32*2-4], m2 + movd [px+32*3-4], m3 + movd m0, [leftq+4*4] + movd m1, [leftq+4*5] + movd m2, [leftq+4*6] + movd m3, [leftq+4*7] + movd [px+32*4-4], m0 + movd [px+32*5-4], m1 + movd [px+32*6-4], m2 + movd [px+32*7-4], m3 + jmp .left_done +.no_left: + REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3, 4, 5, 6, 7 +.left_done: + test edgeb, 2 ; 
HAVE_RIGHT + jnz .padding_done + REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 +.padding_done: + CDEF_FILTER 4, 8 + +%if ARCH_X86_64 +cglobal cdef_filter_8x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \ + pri, sec, edge +%else +cglobal cdef_filter_8x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left +%endif + mov edged, r9m + LEA t0, dir_table + mova m0, [dstq+strideq*0+ 0] + movd m1, [dstq+strideq*0+16] + mova m2, [dstq+strideq*1+ 0] + movd m3, [dstq+strideq*1+16] + lea t1, [dstq+strideq*2] + mova m4, [t1 +strideq*0+ 0] + movd m5, [t1 +strideq*0+16] + mova m6, [t1 +strideq*1+ 0] + movd m7, [t1 +strideq*1+16] + lea t1, [t1 +strideq*2] + mova [px+32*0+ 0], m0 + movd [px+32*0+16], m1 + mova [px+32*1+ 0], m2 + movd [px+32*1+16], m3 + mova [px+32*2+ 0], m4 + movd [px+32*2+16], m5 + mova [px+32*3+ 0], m6 + movd [px+32*3+16], m7 + mova m0, [t1 +strideq*0+ 0] + movd m1, [t1 +strideq*0+16] + mova m2, [t1 +strideq*1+ 0] + movd m3, [t1 +strideq*1+16] + lea t1, [t1 +strideq*2] + mova m4, [t1 +strideq*0+ 0] + movd m5, [t1 +strideq*0+16] + mova m6, [t1 +strideq*1+ 0] + movd m7, [t1 +strideq*1+16] + mova [px+32*4+ 0], m0 + movd [px+32*4+16], m1 + mova [px+32*5+ 0], m2 + movd [px+32*5+16], m3 + mova [px+32*6+ 0], m4 + movd [px+32*6+16], m5 + mova [px+32*7+ 0], m6 + movd [px+32*7+16], m7 + movddup m7, [base+pw_m16384] + test edgeb, 4 ; HAVE_TOP + jz .no_top + movifnidn topq, topmp + mova m0, [topq+strideq*0+ 0] + mova m1, [topq+strideq*0+16] + mova m2, [topq+strideq*1+ 0] + mova m3, [topq+strideq*1+16] + mova [px-32*2+ 0], m0 + movd [px-32*2+16], m1 + mova [px-32*1+ 0], m2 + movd [px-32*1+16], m3 + test edgeb, 1 ; HAVE_LEFT + jz .top_no_left + movd m0, [topq+strideq*0-4] + movd m1, [topq+strideq*1-4] + movd [px-32*2-4], m0 + movd [px-32*1-4], m1 + jmp .top_done +.no_top: + mova [px-32*2+ 0], m7 + movd [px-32*2+16], m7 + mova [px-32*1+ 0], m7 + movd [px-32*1+16], m7 +.top_no_left: + movd [px-32*2- 4], m7 + movd [px-32*1- 4], m7 +.top_done: + test edgeb, 8 ; HAVE_BOTTOM + jz .no_bottom + movifnidn botq, r4mp + mova m0, [botq+strideq*0+ 0] + movd m1, [botq+strideq*0+16] + mova m2, [botq+strideq*1+ 0] + movd m3, [botq+strideq*1+16] + mova [px+32*8+ 0], m0 + movd [px+32*8+16], m1 + mova [px+32*9+ 0], m2 + movd [px+32*9+16], m3 + test edgeb, 1 ; HAVE_LEFT + jz .bottom_no_left + movd m0, [botq+strideq*0-4] + movd m1, [botq+strideq*1-4] + movd [px+32*8- 4], m0 + movd [px+32*9- 4], m1 + jmp .bottom_done +.no_bottom: + mova [px+32*8+ 0], m7 + movd [px+32*8+16], m7 + mova [px+32*9+ 0], m7 + movd [px+32*9+16], m7 +.bottom_no_left: + movd [px+32*8- 4], m7 + movd [px+32*9- 4], m7 +.bottom_done: + test edgeb, 1 ; HAVE_LEFT + jz .no_left + movifnidn leftq, r2mp + movd m0, [leftq+4*0] + movd m1, [leftq+4*1] + movd m2, [leftq+4*2] + movd m3, [leftq+4*3] + movd [px+32*0- 4], m0 + movd [px+32*1- 4], m1 + movd [px+32*2- 4], m2 + movd [px+32*3- 4], m3 + movd m0, [leftq+4*4] + movd m1, [leftq+4*5] + movd m2, [leftq+4*6] + movd m3, [leftq+4*7] + movd [px+32*4- 4], m0 + movd [px+32*5- 4], m1 + movd [px+32*6- 4], m2 + movd [px+32*7- 4], m3 + jmp .left_done +.no_left: + REPX {movd [px+32*x- 4], m7}, 0, 1, 2, 3, 4, 5, 6, 7 +.left_done: + test edgeb, 2 ; HAVE_RIGHT + jnz .padding_done + REPX {movd [px+32*x+16], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 +.padding_done: + CDEF_FILTER 8, 8 + +%macro CDEF_DIR 0 +%if ARCH_X86_64 +cglobal cdef_dir_16bpc, 4, 7, 16, src, stride, var, bdmax + lea r6, [dir_shift] + shr bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc + movddup m7, [r6+bdmaxq*8] + lea r6, [strideq*3] + 
mova m0, [srcq+strideq*0] + mova m1, [srcq+strideq*1] + mova m2, [srcq+strideq*2] + mova m3, [srcq+r6 ] + lea srcq, [srcq+strideq*4] + mova m4, [srcq+strideq*0] + mova m5, [srcq+strideq*1] + mova m6, [srcq+strideq*2] + REPX {pmulhuw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhuw m7, [srcq+r6 ] + pxor m8, m8 + packuswb m9, m0, m1 + packuswb m10, m2, m3 + packuswb m11, m4, m5 + packuswb m12, m6, m7 + REPX {psadbw x, m8}, m9, m10, m11, m12 + packssdw m9, m10 + packssdw m11, m12 + packssdw m9, m11 + jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main +%else +cglobal cdef_dir_16bpc, 2, 4, 8, 96, src, stride, var, bdmax + mov bdmaxd, bdmaxm + LEA r2, dir_shift + shr bdmaxd, 11 + movddup m7, [r2+bdmaxq*8] + lea r3, [strideq*3] + pmulhuw m3, m7, [srcq+strideq*0] + pmulhuw m4, m7, [srcq+strideq*1] + pmulhuw m5, m7, [srcq+strideq*2] + pmulhuw m6, m7, [srcq+r3 ] + movddup m1, [r2-dir_shift+pw_128] + lea srcq, [srcq+strideq*4] + pxor m0, m0 + packuswb m2, m3, m4 + psubw m3, m1 + psubw m4, m1 + mova [esp+0x00], m3 + mova [esp+0x10], m4 + packuswb m3, m5, m6 + psadbw m2, m0 + psadbw m3, m0 + psubw m5, m1 + psubw m6, m1 + packssdw m2, m3 + mova [esp+0x20], m5 + mova [esp+0x50], m6 + pmulhuw m4, m7, [srcq+strideq*0] + pmulhuw m5, m7, [srcq+strideq*1] + pmulhuw m6, m7, [srcq+strideq*2] + pmulhuw m7, [srcq+r3 ] + packuswb m3, m4, m5 + packuswb m1, m6, m7 + psadbw m3, m0 + psadbw m1, m0 + packssdw m3, m1 + movddup m1, [r2-dir_shift+pw_128] + LEA r2, shufw_6543210x + jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main +%endif +%endmacro + +INIT_XMM ssse3 +CDEF_DIR + +INIT_XMM sse4 +CDEF_DIR diff --git a/third_party/dav1d/src/x86/cdef_avx2.asm b/third_party/dav1d/src/x86/cdef_avx2.asm new file mode 100644 index 0000000000..1f30f8a3b7 --- /dev/null +++ b/third_party/dav1d/src/x86/cdef_avx2.asm @@ -0,0 +1,1772 @@ +; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
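+
+; A reading aid for the kernels in this file: the ACCUMULATE_TAP_* macros
+; below vectorize CDEF's constrain() nonlinearity. A minimal C sketch of the
+; scalar per-tap step follows -- imin/imax/apply_sign are assumed helpers in
+; the style of dav1d's C code, so treat this as an illustrative reference
+; rather than a drop-in implementation:
+;
+;   static inline int constrain(const int diff, const int strength,
+;                               const int shift) {
+;       // shift is damping - ulog2(strength), clamped to >= 0
+;       const int adiff = abs(diff);
+;       return apply_sign(imin(adiff, imax(0, strength - (adiff >> shift))),
+;                         diff);
+;   }
+;
+; In the SIMD below, the saturating psubus* provides the imax(0, ...) clamp
+; for free, and the sign correction (pcmpgtw/psraw + paddw) before pmulhrsw
+; with pw_2048 matches the (8 + sum - (sum < 0)) >> 4 rounding of the summed
+; taps.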
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +%macro JMP_TABLE 2-* + %xdefine %1_jmptable %%table + %xdefine %%base mangle(private_prefix %+ _%1_avx2) + %%table: + %rep %0 - 1 + dd %%base %+ .%2 - %%table + %rotate 1 + %endrep +%endmacro + +%macro CDEF_FILTER_JMP_TABLE 1 +JMP_TABLE cdef_filter_%1_8bpc, \ + d6k0, d6k1, d7k0, d7k1, \ + d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \ + d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \ + d0k0, d0k1, d1k0, d1k1 +%endmacro + +SECTION_RODATA 32 + +pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6 +blend_4x4: dd 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00 + dd 0x80, 0x00, 0x00 +blend_4x8_0: dd 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 +blend_4x8_1: dd 0x00, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 + dd 0x00, 0x00 +blend_4x8_2: dd 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080 + dd 0x0000 +blend_4x8_3: dd 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080 + dd 0x0000, 0x0000 +blend_8x8_0: dq 0x00, 0x00, 0x80, 0x80, 0x80, 0x80 +blend_8x8_1: dq 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x0000, 0x0000 +div_table: dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105 +shufw_6543210x:db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 +shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 +pw_128: times 2 dw 128 +pw_2048: times 2 dw 2048 +tap_table: ; masks for 8 bit shifts + db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01 + ; weights + db 4, 2, 3, 3, 2, 1 + db -1 * 16 + 1, -2 * 16 + 2 + db 0 * 16 + 1, -1 * 16 + 2 + db 0 * 16 + 1, 0 * 16 + 2 + db 0 * 16 + 1, 1 * 16 + 2 + db 1 * 16 + 1, 2 * 16 + 2 + db 1 * 16 + 0, 2 * 16 + 1 + db 1 * 16 + 0, 2 * 16 + 0 + db 1 * 16 + 0, 2 * 16 - 1 + ; the last 6 are repeats of the first 6 so we don't need to & 7 + db -1 * 16 + 1, -2 * 16 + 2 + db 0 * 16 + 1, -1 * 16 + 2 + db 0 * 16 + 1, 0 * 16 + 2 + db 0 * 16 + 1, 1 * 16 + 2 + db 1 * 16 + 1, 2 * 16 + 2 + db 1 * 16 + 0, 2 * 16 + 1 + +CDEF_FILTER_JMP_TABLE 4x4 +CDEF_FILTER_JMP_TABLE 4x8 +CDEF_FILTER_JMP_TABLE 8x8 + +SECTION .text + +%macro PREP_REGS 2 ; w, h + ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k] + mov dird, r7m + lea tableq, [cdef_filter_%1x%2_8bpc_jmptable] + lea dirq, [tableq+dirq*2*4] +%if %1 == 4 + %if %2 == 4 + DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \ + table, dir, dirjmp, stride3, k + %else + DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \ + table, dir, dirjmp, dst4, stride3, k + lea dst4q, [dstq+strideq*4] + %endif +%else + DEFINE_ARGS dst, stride, h, top1, bot, pri, sec, \ + table, dir, dirjmp, top2, stride3, k + mov hq, -8 + lea top1q, [top1q+strideq*0] + lea top2q, [top1q+strideq*1] +%endif +%if %1 == 4 + lea stride3q, [strideq*3] +%endif +%endmacro + +%macro LOAD_BLOCK 2-3 0 ; w, h, init_min_max + mov kd, 1 + pxor m15, m15 ; sum +%if %2 == 8 + pxor m12, m12 + %if %1 == 4 + movd xm4, [dstq +strideq*0] + movd xm6, [dstq +strideq*1] + movd xm5, [dstq +strideq*2] + movd xm7, [dstq +stride3q ] + vinserti128 m4, [dst4q+strideq*0], 1 + vinserti128 m6, [dst4q+strideq*1], 1 + vinserti128 m5, [dst4q+strideq*2], 1 + vinserti128 m7, [dst4q+stride3q ], 1 + punpckldq m4, m6 + punpckldq m5, m7 + %else + movq xm4, [dstq+strideq*0] + movq xm5, [dstq+strideq*1] + vinserti128 m4, [dstq+strideq*2], 1 + vinserti128 m5, [dstq+stride3q ], 1 + %endif + punpcklqdq m4, m5 +%else + movd xm4, [dstq+strideq*0] + movd xm5, [dstq+strideq*1] + vinserti128 m4, [dstq+strideq*2], 1 + vinserti128 m5, [dstq+stride3q ], 1 + punpckldq m4, m5 +%endif +%if %3 == 1 + mova m7, 
m4            ; max
+    mova            m8, m4            ; min
+%endif
+%endmacro
+
+%macro ACCUMULATE_TAP_BYTE 7-8 0 ; tap_offset, shift, mask, strength
+                                 ; mul_tap, w, h, clip
+    ; load p0/p1
+    movsxd     dirjmpq, [dirq+kq*4+%1*2*4]
+    add        dirjmpq, tableq
+    call       dirjmpq
+
+%if %8 == 1
+    pmaxub          m7, m5
+    pminub          m8, m5
+    pmaxub          m7, m6
+    pminub          m8, m6
+%endif
+
+    ; accumulate sum[m15] over p0/p1
+%if %7 == 4
+    punpcklbw       m5, m6
+    punpcklbw       m6, m4, m4
+    psubusb         m9, m5, m6
+    psubusb         m5, m6, m5
+    por             m9, m5      ; abs_diff_p01(p01 - px)
+    pcmpeqb         m5, m9
+    por             m5, %5
+    psignb          m6, %5, m5
+    psrlw           m5, m9, %2  ; emulate 8-bit shift
+    pand            m5, %3
+    psubusb         m5, %4, m5
+    pminub          m5, m9
+    pmaddubsw       m5, m6
+    paddw          m15, m5
+%else
+    psubusb         m9, m5, m4
+    psubusb         m5, m4, m5
+    psubusb        m11, m6, m4
+    psubusb         m6, m4, m6
+    por             m9, m5      ; abs_diff_p0(p0 - px)
+    por            m11, m6      ; abs_diff_p1(p1 - px)
+    pcmpeqb         m5, m9
+    pcmpeqb         m6, m11
+    punpckhbw      m10, m9, m11
+    punpcklbw       m9, m11
+    por             m5, %5
+    por            m11, m6, %5
+    punpckhbw       m6, m5, m11
+    punpcklbw       m5, m11
+    psignb         m11, %5, m6
+    psrlw           m6, m10, %2 ; emulate 8-bit shift
+    pand            m6, %3
+    psubusb         m6, %4, m6
+    pminub          m6, m10
+    pmaddubsw       m6, m11
+    paddw          m12, m6
+    psignb         m11, %5, m5
+    psrlw           m5, m9, %2  ; emulate 8-bit shift
+    pand            m5, %3
+    psubusb         m5, %4, m5
+    pminub          m5, m9
+    pmaddubsw       m5, m11
+    paddw          m15, m5
+%endif
+%endmacro
+
+%macro ADJUST_PIXEL 4-5 0 ; w, h, zero, pw_2048, clip
+%if %2 == 4
+    %if %5 == 1
+    punpcklbw       m4, %3
+    %endif
+    pcmpgtw         %3, m15
+    paddw          m15, %3
+    pmulhrsw       m15, %4
+    %if %5 == 0
+    packsswb       m15, m15
+    paddb           m4, m15
+    %else
+    paddw           m4, m15
+    packuswb        m4, m4      ; clip px in [0x0,0xff]
+    pminub          m4, m7
+    pmaxub          m4, m8
+    %endif
+    vextracti128   xm5, m4, 1
+    movd   [dstq+strideq*0], xm4
+    movd   [dstq+strideq*2], xm5
+    pextrd [dstq+strideq*1], xm4, 1
+    pextrd [dstq+stride3q ], xm5, 1
+%else
+    pcmpgtw         m6, %3, m12
+    pcmpgtw         m5, %3, m15
+    paddw          m12, m6
+    paddw          m15, m5
+    %if %5 == 1
+    punpckhbw       m5, m4, %3
+    punpcklbw       m4, %3
+    %endif
+    pmulhrsw       m12, %4
+    pmulhrsw       m15, %4
+    %if %5 == 0
+    packsswb       m15, m12
+    paddb           m4, m15
+    %else
+    paddw           m5, m12
+    paddw           m4, m15
+    packuswb        m4, m5      ; clip px in [0x0,0xff]
+    pminub          m4, m7
+    pmaxub          m4, m8
+    %endif
+    vextracti128   xm5, m4, 1
+    %if %1 == 4
+    movd   [dstq +strideq*0], xm4
+    movd   [dst4q+strideq*0], xm5
+    pextrd [dstq +strideq*1], xm4, 1
+    pextrd [dst4q+strideq*1], xm5, 1
+    pextrd [dstq +strideq*2], xm4, 2
+    pextrd [dst4q+strideq*2], xm5, 2
+    pextrd [dstq +stride3q ], xm4, 3
+    pextrd [dst4q+stride3q ], xm5, 3
+    %else
+    movq   [dstq+strideq*0], xm4
+    movq   [dstq+strideq*2], xm5
+    movhps [dstq+strideq*1], xm4
+    movhps [dstq+stride3q ], xm5
+    %endif
+%endif
+%endmacro
+
+%macro BORDER_PREP_REGS 2 ; w, h
+    ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
+    mov           dird, r7m
+    lea           dirq, [tableq+dirq*2+14]
+%if %1*%2*2/mmsize > 1
+    %if %1 == 4
+        DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, h, off
+    %else
+        DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, h, off
+    %endif
+    mov             hd, %1*%2*2/mmsize
+%else
+    DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, off
+%endif
+    lea           stkq, [px]
+    pxor           m11, m11
+%endmacro
+
+%macro BORDER_LOAD_BLOCK 2-3 0 ; w, h, init_min_max
+    mov             kd, 1
+%if %1 == 4
+    movq           xm4, [stkq+32*0]
+    movhps         xm4, [stkq+32*1]
+    movq           xm5, [stkq+32*2]
+    movhps         xm5, [stkq+32*3]
+    vinserti128     m4, xm5, 1
+%else
+    mova           xm4, [stkq+32*0]  ; px
+    vinserti128     m4, [stkq+32*1], 1
+%endif
+    pxor           m15, m15          ; sum
+%if %3 == 1
+    mova            m7, m4           ; max
+    mova            m8, m4           ; min
+%endif
+%endmacro
+
+%macro ACCUMULATE_TAP_WORD 6-7 0 ; tap_offset, shift, mask, strength
+                                 ; mul_tap, w, clip
+    ; load p0/p1
+    movsx         offq, byte [dirq+kq+%1]    ; off1
+%if %6 == 4
+    movq           xm5, [stkq+offq*2+32*0]   ; p0
+    movq           xm6, [stkq+offq*2+32*2]
+    movhps         xm5, [stkq+offq*2+32*1]
+    movhps         xm6, [stkq+offq*2+32*3]
+    vinserti128     m5, xm6, 1
+%else
+    movu           xm5, [stkq+offq*2+32*0]   ; p0
+    vinserti128     m5, [stkq+offq*2+32*1], 1
+%endif
+    neg           offq                       ; -off1
+%if %6 == 4
+    movq           xm6, [stkq+offq*2+32*0]   ; p1
+    movq           xm9, [stkq+offq*2+32*2]
+    movhps         xm6, [stkq+offq*2+32*1]
+    movhps         xm9, [stkq+offq*2+32*3]
+    vinserti128     m6, xm9, 1
+%else
+    movu           xm6, [stkq+offq*2+32*0]   ; p1
+    vinserti128     m6, [stkq+offq*2+32*1], 1
+%endif
+%if %7 == 1
+    ; out of bounds values are set to a value that is both a large unsigned
+    ; value and a negative signed value.
+    ; use signed max and unsigned min to remove them
+    pmaxsw          m7, m5           ; max after p0
+    pminuw          m8, m5           ; min after p0
+    pmaxsw          m7, m6           ; max after p1
+    pminuw          m8, m6           ; min after p1
+%endif
+
+    ; accumulate sum[m15] over p0/p1
+    ; calculate difference before converting
+    psubw           m5, m4           ; diff_p0(p0 - px)
+    psubw           m6, m4           ; diff_p1(p1 - px)
+
+    ; convert to 8-bits with signed saturation
+    ; saturating large diffs has no impact on the results
+    packsswb        m5, m6
+
+    ; group into pairs so we can accumulate using maddubsw
+    pshufb          m5, m12
+    pabsb           m9, m5
+    psignb         m10, %5, m5
+    psrlw           m5, m9, %2       ; emulate 8-bit shift
+    pand            m5, %3
+    psubusb         m5, %4, m5
+
+    ; use unsigned min since abs diff can equal 0x80
+    pminub          m5, m9
+    pmaddubsw       m5, m10
+    paddw          m15, m5
+%endmacro
+
+%macro BORDER_ADJUST_PIXEL 2-3 0 ; w, pw_2048, clip
+    pcmpgtw         m9, m11, m15
+    paddw          m15, m9
+    pmulhrsw       m15, %2
+    paddw           m4, m15
+%if %3 == 1
+    pminsw          m4, m7
+    pmaxsw          m4, m8
+%endif
+    packuswb        m4, m4
+    vextracti128   xm5, m4, 1
+%if %1 == 4
+    movd   [dstq+strideq*0], xm4
+    pextrd [dstq+strideq*1], xm4, 1
+    movd   [dstq+strideq*2], xm5
+    pextrd [dstq+stride3q ], xm5, 1
+%else
+    movq   [dstq+strideq*0], xm4
+    movq   [dstq+strideq*1], xm5
+%endif
+%endmacro
+
+%macro CDEF_FILTER 2 ; w, h
+INIT_YMM avx2
+cglobal cdef_filter_%1x%2_8bpc, 5, 11, 0, dst, stride, left, top, bot, \
+                                          pri, sec, dir, damping, edge
+%assign stack_offset_entry stack_offset
+    mov          edged, edgem
+    cmp          edged, 0xf
+    jne .border_block
+
+    PUSH           r11
+    PUSH           r12
+%if %2 == 4
+%assign regs_used 13
+    ALLOC_STACK   0x60, 16
+    pmovzxbw       xm0, [leftq+1]
+    vpermq          m0, m0, q0110
+    psrldq          m1, m0, 4
+    vpalignr        m2, m0, m0, 12
+    movu    [rsp+0x10], m0
+    movu    [rsp+0x28], m1
+    movu    [rsp+0x40], m2
+%elif %1 == 4
+%assign regs_used 14
+    PUSH           r13
+    ALLOC_STACK 8*2+%1*%2*1, 16
+    pmovzxwd        m0, [leftq]
+    mova    [rsp+0x10], m0
+%else
+%assign regs_used 15
+    PUSH           r13
+    PUSH           r14
+    ALLOC_STACK 8*4+%1*%2*2+32, 16
+    lea            r11, [strideq*3]
+    movu           xm4, [dstq+strideq*2]
+    pmovzxwq        m0, [leftq+0]
+    pmovzxwq        m1, [leftq+8]
+    vinserti128     m4, [dstq+r11], 1
+    pmovzxbd        m2, [leftq+1]
+    pmovzxbd        m3, [leftq+9]
+    mov       [rsp+16], botq
+    mova    [rsp+0x20], m0
+    mova    [rsp+0x40], m1
+    mova    [rsp+0x60], m2
+    mova    [rsp+0x80], m3
+    mova    [rsp+0xa0], m4
+    lea           botq, [dstq+strideq*4]
+%endif
+
+    DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, zero, pridmp, damping
+    mov       dampingd, r8m
+    xor          zerod, zerod
+    movifnidn     prid, prim
+    sub       dampingd, 31
+    movifnidn  secdmpd, secdmpm
+    test          prid, prid
+    jz .sec_only
+    movd           xm0, prid
+    lzcnt      pridmpd, prid
+    add        pridmpd, dampingd
+    cmovs      pridmpd, zerod
+    mov        [rsp+0], pridmpq      ; pri_shift
+    test       secdmpd, secdmpd
+    jz .pri_only
+    movd           xm1, secdmpd
+    lzcnt      secdmpd, secdmpd
+    add        secdmpd, dampingd
+    mov        [rsp+8], secdmpq      ; sec_shift
+
+    DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, table, pridmp
+    lea         tableq, 
[tap_table] + vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask + vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask + + ; pri/sec_taps[k] [4 total] + DEFINE_ARGS dst, stride, left, top, bot, pri, sec, table, dir + vpbroadcastb m0, xm0 ; pri_strength + vpbroadcastb m1, xm1 ; sec_strength + and prid, 1 + lea priq, [tableq+priq*2+8] ; pri_taps + lea secq, [tableq+12] ; sec_taps + + PREP_REGS %1, %2 +%if %1*%2 > mmsize +.v_loop: +%endif + LOAD_BLOCK %1, %2, 1 +.k_loop: + vpbroadcastb m2, [priq+kq] ; pri_taps + vpbroadcastb m3, [secq+kq] ; sec_taps + ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2, 1 ; dir + 0 + ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir + 2 + ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir - 2 + dec kq + jge .k_loop + + vpbroadcastd m10, [pw_2048] + pxor m9, m9 + ADJUST_PIXEL %1, %2, m9, m10, 1 +%if %1*%2 > mmsize + lea dstq, [dstq+strideq*4] + lea top1q, [rsp+0xa0] + lea top2q, [rsp+0xb0] + mov botq, [rsp+16] + add hq, 4 + jl .v_loop +%endif + RET + +.pri_only: + DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, pridmp + lea tableq, [tap_table] + vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask + ; pri/sec_taps[k] [4 total] + DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, dir + vpbroadcastb m0, xm0 ; pri_strength + and prid, 1 + lea priq, [tableq+priq*2+8] ; pri_taps + PREP_REGS %1, %2 + vpbroadcastd m3, [pw_2048] + pxor m1, m1 +%if %1*%2 > mmsize +.pri_v_loop: +%endif + LOAD_BLOCK %1, %2 +.pri_k_loop: + vpbroadcastb m2, [priq+kq] ; pri_taps + ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0 + dec kq + jge .pri_k_loop + ADJUST_PIXEL %1, %2, m1, m3 +%if %1*%2 > mmsize + lea dstq, [dstq+strideq*4] + lea top1q, [rsp+0xa0] + lea top2q, [rsp+0xb0] + mov botq, [rsp+16] + add hq, 4 + jl .pri_v_loop +%endif + RET + +.sec_only: + DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, zero, _, damping + movd xm1, secdmpd + lzcnt secdmpd, secdmpd + add secdmpd, dampingd + mov [rsp+8], secdmpq ; sec_shift + DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, table + lea tableq, [tap_table] + vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask + ; pri/sec_taps[k] [4 total] + DEFINE_ARGS dst, stride, left, top, bot, _, sec, table, dir + vpbroadcastb m1, xm1 ; sec_strength + lea secq, [tableq+12] ; sec_taps + PREP_REGS %1, %2 + vpbroadcastd m2, [pw_2048] + pxor m0, m0 +%if %1*%2 > mmsize +.sec_v_loop: +%endif + LOAD_BLOCK %1, %2 +.sec_k_loop: + vpbroadcastb m3, [secq+kq] ; sec_taps + ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2 + ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2 + dec kq + jge .sec_k_loop + ADJUST_PIXEL %1, %2, m0, m2 +%if %1*%2 > mmsize + lea dstq, [dstq+strideq*4] + lea top1q, [rsp+0xa0] + lea top2q, [rsp+0xb0] + mov botq, [rsp+16] + add hq, 4 + jl .sec_v_loop +%endif + RET + +.d0k0: +%if %1 == 4 + %if %2 == 4 + vpbroadcastq m6, [dstq+strideq*1-1] + vpbroadcastq m10, [dstq+strideq*2-1] + movd xm5, [topq+strideq*1+1] + movd xm9, [dstq+strideq*0+1] + psrldq m11, m6, 2 + psrldq m12, m10, 2 + vinserti128 m6, [dstq+stride3q -1], 1 + vinserti128 m10, [botq -1], 1 + vpblendd m5, m11, 0x10 + vpblendd m9, m12, 0x10 + movu m11, [blend_4x4+16] + punpckldq m6, m10 + punpckldq m5, m9 + vpblendvb m6, [rsp+gprsize+0x28], m11 + %else + movd xm5, [topq +strideq*1+1] + movq xm6, [dstq +strideq*1-1] + movq xm10, [dstq +stride3q -1] + movq xm11, [dst4q+strideq*1-1] + pinsrd xm5, [dstq +strideq*0+1], 1 + movhps xm6, [dstq +strideq*2-1] + movhps xm10, [dst4q+strideq*0-1] + movhps xm11, 
[dst4q+strideq*2-1] + psrldq xm9, xm6, 2 + shufps xm5, xm9, q2010 ; -1 +0 +1 +2 + shufps xm6, xm10, q2020 ; +1 +2 +3 +4 + psrldq xm9, xm11, 2 + psrldq xm10, 2 + shufps xm10, xm9, q2020 ; +3 +4 +5 +6 + movd xm9, [dst4q+stride3q -1] + pinsrd xm9, [botq -1], 1 + shufps xm11, xm9, q1020 ; +5 +6 +7 +8 + pmovzxbw m9, [leftq+3] + vinserti128 m6, xm11, 1 + movu m11, [blend_4x8_0+4] + vinserti128 m5, xm10, 1 + vpblendvb m6, m9, m11 + %endif +%else + lea r13, [blend_8x8_0+16] + movq xm5, [top2q +1] + vbroadcasti128 m10, [dstq+strideq*1-1] + vbroadcasti128 m11, [dstq+strideq*2-1] + movhps xm5, [dstq+strideq*0+1] + vinserti128 m6, m10, [dstq+stride3q-1], 1 + vinserti128 m9, m11, [botq -1], 1 + psrldq m10, 2 + psrldq m11, 2 + punpcklqdq m6, m9 + movu m9, [r13+hq*2*1+16*1] + punpcklqdq m10, m11 + vpblendd m5, m10, 0xF0 + vpblendvb m6, [rsp+gprsize+0x60+hq*8+64+8*1], m9 +%endif + ret +.d1k0: +.d2k0: +.d3k0: +%if %1 == 4 + %if %2 == 4 + movq xm6, [dstq+strideq*0-1] + movq xm9, [dstq+strideq*1-1] + vinserti128 m6, [dstq+strideq*2-1], 1 + vinserti128 m9, [dstq+stride3q -1], 1 + movu m11, [rsp+gprsize+0x10] + pcmpeqd m12, m12 + psrldq m5, m6, 2 + psrldq m10, m9, 2 + psrld m12, 24 + punpckldq m6, m9 + punpckldq m5, m10 + vpblendvb m6, m11, m12 + %else + movq xm6, [dstq +strideq*0-1] + movq xm9, [dstq +strideq*2-1] + movhps xm6, [dstq +strideq*1-1] + movhps xm9, [dstq +stride3q -1] + movq xm10, [dst4q+strideq*0-1] + movhps xm10, [dst4q+strideq*1-1] + psrldq xm5, xm6, 2 + psrldq xm11, xm9, 2 + shufps xm5, xm11, q2020 + movq xm11, [dst4q+strideq*2-1] + movhps xm11, [dst4q+stride3q -1] + shufps xm6, xm9, q2020 + shufps xm9, xm10, xm11, q2020 + vinserti128 m6, xm9, 1 + pmovzxbw m9, [leftq+1] + psrldq xm10, 2 + psrldq xm11, 2 + shufps xm10, xm11, q2020 + vpbroadcastd m11, [blend_4x8_0+4] + vinserti128 m5, xm10, 1 + vpblendvb m6, m9, m11 + %endif +%else + movu xm5, [dstq+strideq*0-1] + movu xm9, [dstq+strideq*1-1] + vinserti128 m5, [dstq+strideq*2-1], 1 + vinserti128 m9, [dstq+stride3q -1], 1 + movu m10, [blend_8x8_0+16] + punpcklqdq m6, m5, m9 + vpblendvb m6, [rsp+gprsize+0x60+hq*8+64], m10 + psrldq m5, 2 + psrldq m9, 2 + punpcklqdq m5, m9 +%endif + ret +.d4k0: +%if %1 == 4 + %if %2 == 4 + vpbroadcastq m10, [dstq+strideq*1-1] + vpbroadcastq m11, [dstq+strideq*2-1] + movd xm6, [topq+strideq*1-1] + movd xm9, [dstq+strideq*0-1] + psrldq m5, m10, 2 + psrldq m12, m11, 2 + vpblendd m6, m10, 0x10 + vpblendd m9, m11, 0x10 + movu m10, [blend_4x4] + vinserti128 m5, [dstq+stride3q +1], 1 + vinserti128 m12, [botq +1], 1 + punpckldq m6, m9 + punpckldq m5, m12 + vpblendvb m6, [rsp+gprsize+0x40], m10 + %else + movd xm6, [topq +strideq*1-1] + movq xm9, [dstq +strideq*1-1] + movq xm10, [dstq +stride3q -1] + movq xm11, [dst4q+strideq*1-1] + pinsrd xm6, [dstq +strideq*0-1], 1 + movhps xm9, [dstq +strideq*2-1] + movhps xm10, [dst4q+strideq*0-1] + movhps xm11, [dst4q+strideq*2-1] + psrldq xm5, xm9, 2 + shufps xm6, xm9, q2010 + psrldq xm9, xm10, 2 + shufps xm5, xm9, q2020 + shufps xm10, xm11, q2020 + movd xm9, [dst4q+stride3q +1] + vinserti128 m6, xm10, 1 + pinsrd xm9, [botq +1], 1 + psrldq xm11, 2 + pmovzxbw m10, [leftq-1] + shufps xm11, xm9, q1020 + movu m9, [blend_4x8_0] + vinserti128 m5, xm11, 1 + vpblendvb m6, m10, m9 + %endif +%else + lea r13, [blend_8x8_0+8] + movq xm6, [top2q -1] + vbroadcasti128 m5, [dstq+strideq*1-1] + vbroadcasti128 m9, [dstq+strideq*2-1] + movhps xm6, [dstq+strideq*0-1] + movu m11, [r13+hq*2*1+16*1] + punpcklqdq m10, m5, m9 + vinserti128 m5, [dstq+stride3q -1], 1 + vinserti128 m9, [botq -1], 1 + vpblendd m6, 
m10, 0xF0 + vpblendvb m6, [rsp+gprsize+0x60+hq*8+64-8*1], m11 + psrldq m5, 2 + psrldq m9, 2 + punpcklqdq m5, m9 +%endif + ret +.d5k0: +.d6k0: +.d7k0: +%if %1 == 4 + %if %2 == 4 + movd xm6, [topq+strideq*1 ] + vpbroadcastd m5, [dstq+strideq*1 ] + vpbroadcastd m9, [dstq+strideq*2 ] + vpblendd xm6, [dstq+strideq*0-4], 0x2 + vpblendd m5, m9, 0x22 + vpblendd m6, m5, 0x30 + vinserti128 m5, [dstq+stride3q ], 1 + vpblendd m5, [botq -20], 0x20 + %else + movd xm6, [topq +strideq*1] + movd xm5, [dstq +strideq*1] + movd xm9, [dstq +stride3q ] + movd xm10, [dst4q+strideq*1] + movd xm11, [dst4q+stride3q ] + pinsrd xm6, [dstq +strideq*0], 1 + pinsrd xm5, [dstq +strideq*2], 1 + pinsrd xm9, [dst4q+strideq*0], 1 + pinsrd xm10, [dst4q+strideq*2], 1 + pinsrd xm11, [botq ], 1 + punpcklqdq xm6, xm5 + punpcklqdq xm5, xm9 + punpcklqdq xm9, xm10 + punpcklqdq xm10, xm11 + vinserti128 m6, xm9, 1 + vinserti128 m5, xm10, 1 + %endif +%else + movq xm6, [top2q ] + movq xm5, [dstq+strideq*1] + movq xm9, [dstq+stride3q ] + movhps xm6, [dstq+strideq*0] + movhps xm5, [dstq+strideq*2] + movhps xm9, [botq ] + vinserti128 m6, xm5, 1 + vinserti128 m5, xm9, 1 +%endif + ret +.d0k1: +%if %1 == 4 + %if %2 == 4 + movd xm6, [dstq+strideq*2-2] + movd xm9, [dstq+stride3q -2] + movd xm5, [topq+strideq*0+2] + movd xm10, [topq+strideq*1+2] + pinsrw xm6, [leftq+4], 0 + pinsrw xm9, [leftq+6], 0 + vinserti128 m5, [dstq+strideq*0+2], 1 + vinserti128 m10, [dstq+strideq*1+2], 1 + vinserti128 m6, [botq+strideq*0-2], 1 + vinserti128 m9, [botq+strideq*1-2], 1 + punpckldq m5, m10 + punpckldq m6, m9 + %else + movq xm6, [dstq +strideq*2-2] + movd xm10, [dst4q+strideq*2-2] + movd xm5, [topq +strideq*0+2] + movq xm9, [dst4q+strideq*0-2] + movhps xm6, [dstq +stride3q -2] + pinsrw xm10, [dst4q+stride3q ], 3 + pinsrd xm5, [topq +strideq*1+2], 1 + movhps xm9, [dst4q+strideq*1-2] + pinsrd xm10, [botq +strideq*0-2], 2 + pinsrd xm5, [dstq +strideq*0+2], 2 + pinsrd xm10, [botq +strideq*1-2], 3 + pinsrd xm5, [dstq +strideq*1+2], 3 + shufps xm11, xm6, xm9, q3131 + shufps xm6, xm9, q2020 + movu m9, [blend_4x8_3+8] + vinserti128 m6, xm10, 1 + vinserti128 m5, xm11, 1 + vpblendvb m6, [rsp+gprsize+0x10+8], m9 + %endif +%else + lea r13, [blend_8x8_1+16] + movq xm6, [dstq+strideq*2-2] + movq xm9, [dstq+stride3q -2] + movq xm5, [top1q +2] + movq xm10, [top2q +2] + movu m11, [r13+hq*2*2+16*2] + vinserti128 m6, [botq+strideq*0-2], 1 + vinserti128 m9, [botq+strideq*1-2], 1 + vinserti128 m5, [dstq+strideq*0+2], 1 + vinserti128 m10, [dstq+strideq*1+2], 1 + punpcklqdq m6, m9 + punpcklqdq m5, m10 + vpblendvb m6, [rsp+gprsize+0x20+hq*8+64+8*2], m11 +%endif + ret +.d1k1: +%if %1 == 4 + %if %2 == 4 + vpbroadcastq m6, [dstq+strideq*1-2] + vpbroadcastq m9, [dstq+strideq*2-2] + movd xm5, [topq+strideq*1+2] + movd xm10, [dstq+strideq*0+2] + psrldq m11, m6, 4 + psrldq m12, m9, 4 + vpblendd m5, m11, 0x10 + movq xm11, [leftq+2] + vinserti128 m6, [dstq+stride3q-2], 1 + punpckldq xm11, xm11 + vpblendd m10, m12, 0x10 + pcmpeqd m12, m12 + pmovzxwd m11, xm11 + psrld m12, 16 + punpckldq m6, m9 + vpbroadcastd m9, [botq-2] + vpblendvb m6, m11, m12 + punpckldq m5, m10 + vpblendd m6, m9, 0x20 + %else + movd xm5, [topq +strideq*1+2] + movq xm6, [dstq +strideq*1-2] + movq xm9, [dstq +stride3q -2] + movq xm10, [dst4q+strideq*1-2] + movd xm11, [dst4q+stride3q -2] + pinsrd xm5, [dstq +strideq*0+2], 1 + movhps xm6, [dstq +strideq*2-2] + movhps xm9, [dst4q+strideq*0-2] + movhps xm10, [dst4q+strideq*2-2] + pinsrd xm11, [botq -2], 1 + shufps xm5, xm6, q3110 + shufps xm6, xm9, q2020 + shufps xm9, xm10, 
q3131 + shufps xm10, xm11, q1020 + movu m11, [blend_4x8_2+4] + vinserti128 m6, xm10, 1 + vinserti128 m5, xm9, 1 + vpblendvb m6, [rsp+gprsize+0x10+4], m11 + %endif +%else + lea r13, [blend_8x8_1+16] + movq xm5, [top2q +2] + vbroadcasti128 m6, [dstq+strideq*1-2] + vbroadcasti128 m9, [dstq+strideq*2-2] + movhps xm5, [dstq+strideq*0+2] + shufps m10, m6, m9, q2121 + vinserti128 m6, [dstq+stride3q -2], 1 + vinserti128 m9, [botq -2], 1 + movu m11, [r13+hq*2*1+16*1] + vpblendd m5, m10, 0xF0 + punpcklqdq m6, m9 + vpblendvb m6, [rsp+gprsize+0x20+hq*8+64+8*1], m11 +%endif + ret +.d2k1: +%if %1 == 4 + %if %2 == 4 + movq xm11, [leftq] + movq xm6, [dstq+strideq*0-2] + movq xm9, [dstq+strideq*1-2] + vinserti128 m6, [dstq+strideq*2-2], 1 + vinserti128 m9, [dstq+stride3q -2], 1 + punpckldq xm11, xm11 + psrldq m5, m6, 4 + psrldq m10, m9, 4 + pmovzxwd m11, xm11 + punpckldq m6, m9 + punpckldq m5, m10 + pblendw m6, m11, 0x05 + %else + movq xm5, [dstq +strideq*0-2] + movq xm9, [dstq +strideq*2-2] + movq xm10, [dst4q+strideq*0-2] + movq xm11, [dst4q+strideq*2-2] + movhps xm5, [dstq +strideq*1-2] + movhps xm9, [dstq +stride3q -2] + movhps xm10, [dst4q+strideq*1-2] + movhps xm11, [dst4q+stride3q -2] + shufps xm6, xm5, xm9, q2020 + shufps xm5, xm9, q3131 + shufps xm9, xm10, xm11, q2020 + shufps xm10, xm11, q3131 + pmovzxwd m11, [leftq] + vinserti128 m6, xm9, 1 + vinserti128 m5, xm10, 1 + pblendw m6, m11, 0x55 + %endif +%else + mova m11, [rsp+gprsize+0x20+hq*8+64] + movu xm5, [dstq+strideq*0-2] + movu xm9, [dstq+strideq*1-2] + vinserti128 m5, [dstq+strideq*2-2], 1 + vinserti128 m9, [dstq+stride3q -2], 1 + shufps m6, m5, m9, q1010 + shufps m5, m9, q2121 + pblendw m6, m11, 0x11 +%endif + ret +.d3k1: +%if %1 == 4 + %if %2 == 4 + vpbroadcastq m11, [dstq+strideq*1-2] + vpbroadcastq m12, [dstq+strideq*2-2] + movd xm6, [topq+strideq*1-2] + movd xm9, [dstq+strideq*0-2] + pblendw m11, [leftq-16+2], 0x01 + pblendw m12, [leftq-16+4], 0x01 + pinsrw xm9, [leftq- 0+0], 0 + psrldq m5, m11, 4 + psrldq m10, m12, 4 + vinserti128 m5, [dstq+stride3q +2], 1 + vinserti128 m10, [botq +2], 1 + vpblendd m6, m11, 0x10 + vpblendd m9, m12, 0x10 + punpckldq m6, m9 + punpckldq m5, m10 + %else + movd xm6, [topq +strideq*1-2] + movq xm5, [dstq +strideq*1-2] + movq xm9, [dstq +stride3q -2] + movq xm10, [dst4q+strideq*1-2] + movd xm11, [dst4q+stride3q +2] + pinsrw xm6, [dstq +strideq*0 ], 3 + movhps xm5, [dstq +strideq*2-2] + movhps xm9, [dst4q+strideq*0-2] + movhps xm10, [dst4q+strideq*2-2] + pinsrd xm11, [botq +2], 1 + shufps xm6, xm5, q2010 + shufps xm5, xm9, q3131 + shufps xm9, xm10, q2020 + shufps xm10, xm11, q1031 + movu m11, [blend_4x8_2] + vinserti128 m6, xm9, 1 + vinserti128 m5, xm10, 1 + vpblendvb m6, [rsp+gprsize+0x10-4], m11 + %endif +%else + lea r13, [blend_8x8_1+8] + movq xm6, [top2q -2] + vbroadcasti128 m5, [dstq+strideq*1-2] + vbroadcasti128 m10, [dstq+strideq*2-2] + movhps xm6, [dstq+strideq*0-2] + punpcklqdq m9, m5, m10 + vinserti128 m5, [dstq+stride3q -2], 1 + vinserti128 m10, [botq -2], 1 + movu m11, [r13+hq*2*1+16*1] + vpblendd m6, m9, 0xF0 + shufps m5, m10, q2121 + vpblendvb m6, [rsp+gprsize+0x20+hq*8+64-8*1], m11 +%endif + ret +.d4k1: +%if %1 == 4 + %if %2 == 4 + vinserti128 m6, [dstq+strideq*0-2], 1 + vinserti128 m9, [dstq+strideq*1-2], 1 + movd xm5, [dstq+strideq*2+2] + movd xm10, [dstq+stride3q +2] + pblendw m6, [leftq-16+0], 0x01 + pblendw m9, [leftq-16+2], 0x01 + vinserti128 m5, [botq+strideq*0+2], 1 + vinserti128 m10, [botq+strideq*1+2], 1 + vpblendd m6, [topq+strideq*0-2], 0x01 + vpblendd m9, [topq+strideq*1-2], 0x01 + 
punpckldq m5, m10 + punpckldq m6, m9 + %else + movd xm6, [topq +strideq*0-2] + movq xm5, [dstq +strideq*2-2] + movq xm9, [dst4q+strideq*0-2] + movd xm10, [dst4q+strideq*2+2] + pinsrd xm6, [topq +strideq*1-2], 1 + movhps xm5, [dstq +stride3q -2] + movhps xm9, [dst4q+strideq*1-2] + pinsrd xm10, [dst4q+stride3q +2], 1 + pinsrd xm6, [dstq +strideq*0-2], 2 + pinsrd xm10, [botq +strideq*0+2], 2 + pinsrd xm6, [dstq +strideq*1-2], 3 + pinsrd xm10, [botq +strideq*1+2], 3 + shufps xm11, xm5, xm9, q2020 + shufps xm5, xm9, q3131 + movu m9, [blend_4x8_3] + vinserti128 m6, xm11, 1 + vinserti128 m5, xm10, 1 + vpblendvb m6, [rsp+gprsize+0x10-8], m9 + %endif +%else + lea r13, [blend_8x8_1] + movu m11, [r13+hq*2*2+16*2] + movq xm6, [top1q -2] + movq xm9, [top2q -2] + movq xm5, [dstq+strideq*2+2] + movq xm10, [dstq+stride3q +2] + vinserti128 m6, [dstq+strideq*0-2], 1 + vinserti128 m9, [dstq+strideq*1-2], 1 + vinserti128 m5, [botq+strideq*0+2], 1 + vinserti128 m10, [botq+strideq*1+2], 1 + punpcklqdq m6, m9 + vpblendvb m6, [rsp+gprsize+0x20+hq*8+64-8*2], m11 + punpcklqdq m5, m10 +%endif + ret +.d5k1: +%if %1 == 4 + %if %2 == 4 + movd xm6, [topq+strideq*0-1] + movd xm9, [topq+strideq*1-1] + movd xm5, [dstq+strideq*2+1] + movd xm10, [dstq+stride3q +1] + pcmpeqd m12, m12 + pmovzxbw m11, [leftq-8+1] + psrld m12, 24 + vinserti128 m6, [dstq+strideq*0-1], 1 + vinserti128 m9, [dstq+strideq*1-1], 1 + vinserti128 m5, [botq+strideq*0+1], 1 + vinserti128 m10, [botq+strideq*1+1], 1 + punpckldq m6, m9 + pxor m9, m9 + vpblendd m12, m9, 0x0F + punpckldq m5, m10 + vpblendvb m6, m11, m12 + %else + movd xm6, [topq +strideq*0-1] + movq xm5, [dstq +strideq*2-1] + movq xm9, [dst4q+strideq*0-1] + movd xm10, [dst4q+strideq*2+1] + pinsrd xm6, [topq +strideq*1-1], 1 + movhps xm5, [dstq +stride3q -1] + movhps xm9, [dst4q+strideq*1-1] + pinsrd xm10, [dst4q+stride3q +1], 1 + pinsrd xm6, [dstq +strideq*0-1], 2 + pinsrd xm10, [botq +strideq*0+1], 2 + pinsrd xm6, [dstq +strideq*1-1], 3 + pinsrd xm10, [botq +strideq*1+1], 3 + shufps xm11, xm5, xm9, q2020 + vinserti128 m6, xm11, 1 + pmovzxbw m11, [leftq-3] + psrldq xm5, 2 + psrldq xm9, 2 + shufps xm5, xm9, q2020 + movu m9, [blend_4x8_1] + vinserti128 m5, xm10, 1 + vpblendvb m6, m11, m9 + %endif +%else + lea r13, [blend_8x8_0] + movu m11, [r13+hq*2*2+16*2] + movq xm6, [top1q -1] + movq xm9, [top2q -1] + movq xm5, [dstq+strideq*2+1] + movq xm10, [dstq+stride3q +1] + vinserti128 m6, [dstq+strideq*0-1], 1 + vinserti128 m9, [dstq+strideq*1-1], 1 + vinserti128 m5, [botq+strideq*0+1], 1 + vinserti128 m10, [botq+strideq*1+1], 1 + punpcklqdq m6, m9 + punpcklqdq m5, m10 + vpblendvb m6, [rsp+gprsize+0x60+hq*8+64-8*2], m11 +%endif + ret +.d6k1: +%if %1 == 4 + %if %2 == 4 + movd xm6, [topq+strideq*0] + movd xm9, [topq+strideq*1] + movd xm5, [dstq+strideq*2] + movd xm10, [dstq+stride3q ] + vinserti128 m6, [dstq+strideq*0], 1 + vinserti128 m9, [dstq+strideq*1], 1 + vinserti128 m5, [botq+strideq*0], 1 + vinserti128 m10, [botq+strideq*1], 1 + punpckldq m6, m9 + punpckldq m5, m10 + %else + movd xm5, [dstq +strideq*2] + movd xm6, [topq +strideq*0] + movd xm9, [dst4q+strideq*2] + pinsrd xm5, [dstq +stride3q ], 1 + pinsrd xm6, [topq +strideq*1], 1 + pinsrd xm9, [dst4q+stride3q ], 1 + pinsrd xm5, [dst4q+strideq*0], 2 + pinsrd xm6, [dstq +strideq*0], 2 + pinsrd xm9, [botq +strideq*0], 2 + pinsrd xm5, [dst4q+strideq*1], 3 + pinsrd xm6, [dstq +strideq*1], 3 + pinsrd xm9, [botq +strideq*1], 3 + vinserti128 m6, xm5, 1 + vinserti128 m5, xm9, 1 + %endif +%else + movq xm5, [dstq+strideq*2] + movq xm9, [botq+strideq*0] + 
movq xm6, [top1q ] + movq xm10, [dstq+strideq*0] + movhps xm5, [dstq+stride3q ] + movhps xm9, [botq+strideq*1] + movhps xm6, [top2q ] + movhps xm10, [dstq+strideq*1] + vinserti128 m5, xm9, 1 + vinserti128 m6, xm10, 1 +%endif + ret +.d7k1: +%if %1 == 4 + %if %2 == 4 + movd xm5, [dstq+strideq*2-1] + movd xm9, [dstq+stride3q -1] + movd xm6, [topq+strideq*0+1] + movd xm10, [topq+strideq*1+1] + pinsrb xm5, [leftq+ 5], 0 + pinsrb xm9, [leftq+ 7], 0 + vinserti128 m6, [dstq+strideq*0+1], 1 + vinserti128 m10, [dstq+strideq*1+1], 1 + vinserti128 m5, [botq+strideq*0-1], 1 + vinserti128 m9, [botq+strideq*1-1], 1 + punpckldq m6, m10 + punpckldq m5, m9 + %else + movd xm6, [topq +strideq*0+1] + movq xm9, [dstq +strideq*2-1] + movq xm10, [dst4q+strideq*0-1] + movd xm11, [dst4q+strideq*2-1] + pinsrd xm6, [topq +strideq*1+1], 1 + movhps xm9, [dstq +stride3q -1] + movhps xm10, [dst4q+strideq*1-1] + pinsrd xm11, [dst4q+stride3q -1], 1 + pinsrd xm6, [dstq +strideq*0+1], 2 + pinsrd xm11, [botq +strideq*0-1], 2 + pinsrd xm6, [dstq +strideq*1+1], 3 + pinsrd xm11, [botq +strideq*1-1], 3 + shufps xm5, xm9, xm10, q2020 + vinserti128 m5, xm11, 1 + pmovzxbw m11, [leftq+5] + psrldq xm9, 2 + psrldq xm10, 2 + shufps xm9, xm10, q2020 + movu m10, [blend_4x8_1+8] + vinserti128 m6, xm9, 1 + vpblendvb m5, m11, m10 + %endif +%else + lea r13, [blend_8x8_0+16] + movq xm5, [dstq+strideq*2-1] + movq xm9, [botq+strideq*0-1] + movq xm6, [top1q +1] + movq xm10, [dstq+strideq*0+1] + movhps xm5, [dstq+stride3q -1] + movhps xm9, [botq+strideq*1-1] + movhps xm6, [top2q +1] + movhps xm10, [dstq+strideq*1+1] + movu m11, [r13+hq*2*2+16*2] + vinserti128 m5, xm9, 1 + vinserti128 m6, xm10, 1 + vpblendvb m5, [rsp+gprsize+0x60+hq*8+64+8*2], m11 +%endif + ret + +.border_block: + DEFINE_ARGS dst, stride, left, top, bot, pri, sec, stride3, dst4, edge +%define rstk rsp +%assign stack_offset stack_offset_entry +%assign regs_used 11 + ALLOC_STACK 2*16+(%2+4)*32, 16 +%define px rsp+2*16+2*32 + + pcmpeqw m14, m14 + psllw m14, 15 ; 0x8000 + + ; prepare pixel buffers - body/right +%if %1 == 4 + INIT_XMM avx2 +%endif +%if %2 == 8 + lea dst4q, [dstq+strideq*4] +%endif + lea stride3q, [strideq*3] + test edgeb, 2 ; have_right + jz .no_right + pmovzxbw m1, [dstq+strideq*0] + pmovzxbw m2, [dstq+strideq*1] + pmovzxbw m3, [dstq+strideq*2] + pmovzxbw m4, [dstq+stride3q] + mova [px+0*32], m1 + mova [px+1*32], m2 + mova [px+2*32], m3 + mova [px+3*32], m4 +%if %2 == 8 + pmovzxbw m1, [dst4q+strideq*0] + pmovzxbw m2, [dst4q+strideq*1] + pmovzxbw m3, [dst4q+strideq*2] + pmovzxbw m4, [dst4q+stride3q] + mova [px+4*32], m1 + mova [px+5*32], m2 + mova [px+6*32], m3 + mova [px+7*32], m4 +%endif + jmp .body_done +.no_right: +%if %1 == 4 + movd xm1, [dstq+strideq*0] + movd xm2, [dstq+strideq*1] + movd xm3, [dstq+strideq*2] + movd xm4, [dstq+stride3q] + pmovzxbw xm1, xm1 + pmovzxbw xm2, xm2 + pmovzxbw xm3, xm3 + pmovzxbw xm4, xm4 + movq [px+0*32], xm1 + movq [px+1*32], xm2 + movq [px+2*32], xm3 + movq [px+3*32], xm4 +%else + pmovzxbw xm1, [dstq+strideq*0] + pmovzxbw xm2, [dstq+strideq*1] + pmovzxbw xm3, [dstq+strideq*2] + pmovzxbw xm4, [dstq+stride3q] + mova [px+0*32], xm1 + mova [px+1*32], xm2 + mova [px+2*32], xm3 + mova [px+3*32], xm4 +%endif + movd [px+0*32+%1*2], xm14 + movd [px+1*32+%1*2], xm14 + movd [px+2*32+%1*2], xm14 + movd [px+3*32+%1*2], xm14 +%if %2 == 8 + %if %1 == 4 + movd xm1, [dst4q+strideq*0] + movd xm2, [dst4q+strideq*1] + movd xm3, [dst4q+strideq*2] + movd xm4, [dst4q+stride3q] + pmovzxbw xm1, xm1 + pmovzxbw xm2, xm2 + pmovzxbw xm3, xm3 + pmovzxbw xm4, xm4 
+ movq [px+4*32], xm1 + movq [px+5*32], xm2 + movq [px+6*32], xm3 + movq [px+7*32], xm4 + %else + pmovzxbw xm1, [dst4q+strideq*0] + pmovzxbw xm2, [dst4q+strideq*1] + pmovzxbw xm3, [dst4q+strideq*2] + pmovzxbw xm4, [dst4q+stride3q] + mova [px+4*32], xm1 + mova [px+5*32], xm2 + mova [px+6*32], xm3 + mova [px+7*32], xm4 + %endif + movd [px+4*32+%1*2], xm14 + movd [px+5*32+%1*2], xm14 + movd [px+6*32+%1*2], xm14 + movd [px+7*32+%1*2], xm14 +%endif +.body_done: + + ; top + test edgeb, 4 ; have_top + jz .no_top + test edgeb, 1 ; have_left + jz .top_no_left + test edgeb, 2 ; have_right + jz .top_no_right + pmovzxbw m1, [topq+strideq*0-(%1/2)] + pmovzxbw m2, [topq+strideq*1-(%1/2)] + movu [px-2*32-%1], m1 + movu [px-1*32-%1], m2 + jmp .top_done +.top_no_right: + pmovzxbw m1, [topq+strideq*0-%1] + pmovzxbw m2, [topq+strideq*1-%1] + movu [px-2*32-%1*2], m1 + movu [px-1*32-%1*2], m2 + movd [px-2*32+%1*2], xm14 + movd [px-1*32+%1*2], xm14 + jmp .top_done +.top_no_left: + test edgeb, 2 ; have_right + jz .top_no_left_right + pmovzxbw m1, [topq+strideq*0] + pmovzxbw m2, [topq+strideq*1] + mova [px-2*32+0], m1 + mova [px-1*32+0], m2 + movd [px-2*32-4], xm14 + movd [px-1*32-4], xm14 + jmp .top_done +.top_no_left_right: +%if %1 == 4 + movd xm1, [topq+strideq*0] + pinsrd xm1, [topq+strideq*1], 1 + pmovzxbw xm1, xm1 + movq [px-2*32+0], xm1 + movhps [px-1*32+0], xm1 +%else + pmovzxbw xm1, [topq+strideq*0] + pmovzxbw xm2, [topq+strideq*1] + mova [px-2*32+0], xm1 + mova [px-1*32+0], xm2 +%endif + movd [px-2*32-4], xm14 + movd [px-1*32-4], xm14 + movd [px-2*32+%1*2], xm14 + movd [px-1*32+%1*2], xm14 + jmp .top_done +.no_top: + movu [px-2*32-%1], m14 + movu [px-1*32-%1], m14 +.top_done: + + ; left + test edgeb, 1 ; have_left + jz .no_left + pmovzxbw xm1, [leftq+ 0] +%if %2 == 8 + pmovzxbw xm2, [leftq+ 8] +%endif + movd [px+0*32-4], xm1 + pextrd [px+1*32-4], xm1, 1 + pextrd [px+2*32-4], xm1, 2 + pextrd [px+3*32-4], xm1, 3 +%if %2 == 8 + movd [px+4*32-4], xm2 + pextrd [px+5*32-4], xm2, 1 + pextrd [px+6*32-4], xm2, 2 + pextrd [px+7*32-4], xm2, 3 +%endif + jmp .left_done +.no_left: + movd [px+0*32-4], xm14 + movd [px+1*32-4], xm14 + movd [px+2*32-4], xm14 + movd [px+3*32-4], xm14 +%if %2 == 8 + movd [px+4*32-4], xm14 + movd [px+5*32-4], xm14 + movd [px+6*32-4], xm14 + movd [px+7*32-4], xm14 +%endif +.left_done: + + ; bottom + DEFINE_ARGS dst, stride, _, _, bot, pri, sec, stride3, _, edge + test edgeb, 8 ; have_bottom + jz .no_bottom + test edgeb, 1 ; have_left + jz .bottom_no_left + test edgeb, 2 ; have_right + jz .bottom_no_right + pmovzxbw m1, [botq+strideq*0-(%1/2)] + pmovzxbw m2, [botq+strideq*1-(%1/2)] + movu [px+(%2+0)*32-%1], m1 + movu [px+(%2+1)*32-%1], m2 + jmp .bottom_done +.bottom_no_right: + pmovzxbw m1, [botq+strideq*0-%1] + pmovzxbw m2, [botq+strideq*1-%1] + movu [px+(%2+0)*32-%1*2], m1 + movu [px+(%2+1)*32-%1*2], m2 +%if %1 == 8 + movd [px+(%2-1)*32+%1*2], xm14 ; overwritten by previous movu +%endif + movd [px+(%2+0)*32+%1*2], xm14 + movd [px+(%2+1)*32+%1*2], xm14 + jmp .bottom_done +.bottom_no_left: + test edgeb, 2 ; have_right + jz .bottom_no_left_right + pmovzxbw m1, [botq+strideq*0] + pmovzxbw m2, [botq+strideq*1] + mova [px+(%2+0)*32+0], m1 + mova [px+(%2+1)*32+0], m2 + movd [px+(%2+0)*32-4], xm14 + movd [px+(%2+1)*32-4], xm14 + jmp .bottom_done +.bottom_no_left_right: +%if %1 == 4 + movd xm1, [botq+strideq*0] + pinsrd xm1, [botq+strideq*1], 1 + pmovzxbw xm1, xm1 + movq [px+(%2+0)*32+0], xm1 + movhps [px+(%2+1)*32+0], xm1 +%else + pmovzxbw xm1, [botq+strideq*0] + pmovzxbw xm2, [botq+strideq*1] + 
mova [px+(%2+0)*32+0], xm1 + mova [px+(%2+1)*32+0], xm2 +%endif + movd [px+(%2+0)*32-4], xm14 + movd [px+(%2+1)*32-4], xm14 + movd [px+(%2+0)*32+%1*2], xm14 + movd [px+(%2+1)*32+%1*2], xm14 + jmp .bottom_done +.no_bottom: + movu [px+(%2+0)*32-%1], m14 + movu [px+(%2+1)*32-%1], m14 +.bottom_done: + + ; actual filter + INIT_YMM avx2 + DEFINE_ARGS dst, stride, _, pridmp, damping, pri, secdmp, stride3, zero +%undef edged + ; register to shuffle values into after packing + vbroadcasti128 m12, [shufb_lohi] + + mov dampingd, r8m + xor zerod, zerod + movifnidn prid, prim + sub dampingd, 31 + movifnidn secdmpd, secdmpm + test prid, prid + jz .border_sec_only + movd xm0, prid + lzcnt pridmpd, prid + add pridmpd, dampingd + cmovs pridmpd, zerod + mov [rsp+0], pridmpq ; pri_shift + test secdmpd, secdmpd + jz .border_pri_only + movd xm1, secdmpd + lzcnt secdmpd, secdmpd + add secdmpd, dampingd + mov [rsp+8], secdmpq ; sec_shift + + DEFINE_ARGS dst, stride, _, pridmp, table, pri, secdmp, stride3 + lea tableq, [tap_table] + vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask + vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask + + ; pri/sec_taps[k] [4 total] + DEFINE_ARGS dst, stride, _, dir, table, pri, sec, stride3 + vpbroadcastb m0, xm0 ; pri_strength + vpbroadcastb m1, xm1 ; sec_strength + and prid, 1 + lea priq, [tableq+priq*2+8] ; pri_taps + lea secq, [tableq+12] ; sec_taps + + BORDER_PREP_REGS %1, %2 +%if %1*%2*2/mmsize > 1 +.border_v_loop: +%endif + BORDER_LOAD_BLOCK %1, %2, 1 +.border_k_loop: + vpbroadcastb m2, [priq+kq] ; pri_taps + vpbroadcastb m3, [secq+kq] ; sec_taps + ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1, 1 + ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1, 1 + ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1, 1 + dec kq + jge .border_k_loop + + vpbroadcastd m10, [pw_2048] + BORDER_ADJUST_PIXEL %1, m10, 1 +%if %1*%2*2/mmsize > 1 + %define vloop_lines (mmsize/(%1*2)) + lea dstq, [dstq+strideq*vloop_lines] + add stkq, 32*vloop_lines + dec hd + jg .border_v_loop +%endif + RET + +.border_pri_only: + DEFINE_ARGS dst, stride, _, pridmp, table, pri, _, stride3 + lea tableq, [tap_table] + vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask + DEFINE_ARGS dst, stride, _, dir, table, pri, _, stride3 + vpbroadcastb m0, xm0 ; pri_strength + and prid, 1 + lea priq, [tableq+priq*2+8] ; pri_taps + BORDER_PREP_REGS %1, %2 + vpbroadcastd m1, [pw_2048] +%if %1*%2*2/mmsize > 1 +.border_pri_v_loop: +%endif + BORDER_LOAD_BLOCK %1, %2 +.border_pri_k_loop: + vpbroadcastb m2, [priq+kq] ; pri_taps + ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1 + dec kq + jge .border_pri_k_loop + BORDER_ADJUST_PIXEL %1, m1 +%if %1*%2*2/mmsize > 1 + %define vloop_lines (mmsize/(%1*2)) + lea dstq, [dstq+strideq*vloop_lines] + add stkq, 32*vloop_lines + dec hd + jg .border_pri_v_loop +%endif + RET + +.border_sec_only: + DEFINE_ARGS dst, stride, _, _, damping, _, secdmp, stride3 + movd xm1, secdmpd + lzcnt secdmpd, secdmpd + add secdmpd, dampingd + mov [rsp+8], secdmpq ; sec_shift + DEFINE_ARGS dst, stride, _, _, table, _, secdmp, stride3 + lea tableq, [tap_table] + vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask + DEFINE_ARGS dst, stride, _, dir, table, _, sec, stride3 + vpbroadcastb m1, xm1 ; sec_strength + lea secq, [tableq+12] ; sec_taps + BORDER_PREP_REGS %1, %2 + vpbroadcastd m0, [pw_2048] +%if %1*%2*2/mmsize > 1 +.border_sec_v_loop: +%endif + BORDER_LOAD_BLOCK %1, %2 +.border_sec_k_loop: + vpbroadcastb m3, [secq+kq] ; sec_taps + ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1 + ACCUMULATE_TAP_WORD 6*2, 
[rsp+8], m14, m1, m3, %1 + dec kq + jge .border_sec_k_loop + BORDER_ADJUST_PIXEL %1, m0 +%if %1*%2*2/mmsize > 1 + %define vloop_lines (mmsize/(%1*2)) + lea dstq, [dstq+strideq*vloop_lines] + add stkq, 32*vloop_lines + dec hd + jg .border_sec_v_loop +%endif + RET +%endmacro + +CDEF_FILTER 8, 8 +CDEF_FILTER 4, 8 +CDEF_FILTER 4, 4 + +INIT_YMM avx2 +cglobal cdef_dir_8bpc, 3, 4, 6, src, stride, var, stride3 + lea stride3q, [strideq*3] + movq xm0, [srcq+strideq*0] + movq xm1, [srcq+strideq*1] + movq xm2, [srcq+strideq*2] + movq xm3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpbroadcastq m4, [srcq+stride3q ] + vpbroadcastq m5, [srcq+strideq*2] + vpblendd m0, m4, 0xf0 + vpblendd m1, m5, 0xf0 + vpbroadcastq m4, [srcq+strideq*1] + vpbroadcastq m5, [srcq+strideq*0] + vpblendd m2, m4, 0xf0 + vpblendd m3, m5, 0xf0 + pxor m4, m4 + punpcklbw m0, m4 + punpcklbw m1, m4 + punpcklbw m2, m4 + punpcklbw m3, m4 +cglobal_label .main + vpbroadcastd m4, [pw_128] + PROLOGUE 3, 4, 15 + psubw m0, m4 + psubw m1, m4 + psubw m2, m4 + psubw m3, m4 + + ; shuffle registers to generate partial_sum_diag[0-1] together + vperm2i128 m7, m0, m0, 0x01 + vperm2i128 m6, m1, m1, 0x01 + vperm2i128 m5, m2, m2, 0x01 + vperm2i128 m4, m3, m3, 0x01 + + ; start with partial_sum_hv[0-1] + paddw m8, m0, m1 + paddw m9, m2, m3 + phaddw m10, m0, m1 + phaddw m11, m2, m3 + paddw m8, m9 + phaddw m10, m11 + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + paddw xm8, xm9 ; partial_sum_hv[1] + phaddw xm10, xm11 ; partial_sum_hv[0] + vinserti128 m8, xm10, 1 + vpbroadcastd m9, [div_table+44] + pmaddwd m8, m8 + pmulld m8, m9 ; cost6[2a-d] | cost2[a-d] + + ; create aggregates [lower half]: + ; m9 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234+ + ; m4:xxxx0123+m5:xxxxx012+m6:xxxxxx01+m7:xxxxxxx0 + ; m10= m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx+ + ; m4:4567xxxx+m5:34567xxx+m6:234567xx+m7:1234567x + ; and [upper half]: + ; m9 = m0:xxxxxxx0+m1:xxxxxx01+m2:xxxxx012+m3:xxxx0123+ + ; m4:xxx01234+m5:xx012345+m6:x0123456+m7:01234567 + ; m10= m0:1234567x+m1:234567xx+m2:34567xxx+m3:4567xxxx+ + ; m4:567xxxxx+m5:67xxxxxx+m6:7xxxxxxx + ; and then shuffle m11 [shufw_6543210x], unpcklwd, pmaddwd, pmulld, paddd + + pslldq m9, m1, 2 + psrldq m10, m1, 14 + pslldq m11, m2, 4 + psrldq m12, m2, 12 + pslldq m13, m3, 6 + psrldq m14, m3, 10 + paddw m9, m11 + paddw m10, m12 + paddw m9, m13 + paddw m10, m14 + pslldq m11, m4, 8 + psrldq m12, m4, 8 + pslldq m13, m5, 10 + psrldq m14, m5, 6 + paddw m9, m11 + paddw m10, m12 + paddw m9, m13 + paddw m10, m14 + pslldq m11, m6, 12 + psrldq m12, m6, 4 + pslldq m13, m7, 14 + psrldq m14, m7, 2 + paddw m9, m11 + paddw m10, m12 + paddw m9, m13 + paddw m10, m14 ; partial_sum_diag[0/1][8-14,zero] + vbroadcasti128 m14, [shufw_6543210x] + vbroadcasti128 m13, [div_table+16] + vbroadcasti128 m12, [div_table+0] + paddw m9, m0 ; partial_sum_diag[0/1][0-7] + pshufb m10, m14 + punpckhwd m11, m9, m10 + punpcklwd m9, m10 + pmaddwd m11, m11 + pmaddwd m9, m9 + pmulld m11, m13 + pmulld m9, m12 + paddd m9, m11 ; cost0[a-d] | cost4[a-d] + + ; merge horizontally and vertically for partial_sum_alt[0-3] + paddw m10, m0, m1 + paddw m11, m2, m3 + paddw m12, m4, m5 + paddw m13, m6, m7 + phaddw m0, m4 + phaddw m1, m5 + phaddw m2, m6 + phaddw m3, m7 + + ; create aggregates [lower half]: + ; m4 = m10:01234567+m11:x0123456+m12:xx012345+m13:xxx01234 + ; m11= m11:7xxxxxxx+m12:67xxxxxx+m13:567xxxxx + ; and [upper half]: + ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567 + ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx + ; and then pshuflw m11 3012, 
unpcklwd, pmaddwd, pmulld, paddd + + pslldq m4, m11, 2 + psrldq m11, 14 + pslldq m5, m12, 4 + psrldq m12, 12 + pslldq m6, m13, 6 + psrldq m13, 10 + paddw m4, m10 + paddw m11, m12 + vpbroadcastd m12, [div_table+44] + paddw m5, m6 + paddw m11, m13 ; partial_sum_alt[3/2] right + vbroadcasti128 m13, [div_table+32] + paddw m4, m5 ; partial_sum_alt[3/2] left + pshuflw m5, m11, q3012 + punpckhwd m6, m11, m4 + punpcklwd m4, m5 + pmaddwd m6, m6 + pmaddwd m4, m4 + pmulld m6, m12 + pmulld m4, m13 + paddd m4, m6 ; cost7[a-d] | cost5[a-d] + + ; create aggregates [lower half]: + ; m5 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234 + ; m1 = m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx + ; and [upper half]: + ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567 + ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx + ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd + + pslldq m5, m1, 2 + psrldq m1, 14 + pslldq m6, m2, 4 + psrldq m2, 12 + pslldq m7, m3, 6 + psrldq m3, 10 + paddw m5, m0 + paddw m1, m2 + paddw m6, m7 + paddw m1, m3 ; partial_sum_alt[0/1] right + paddw m5, m6 ; partial_sum_alt[0/1] left + pshuflw m0, m1, q3012 + punpckhwd m1, m5 + punpcklwd m5, m0 + pmaddwd m1, m1 + pmaddwd m5, m5 + pmulld m1, m12 + pmulld m5, m13 + paddd m5, m1 ; cost1[a-d] | cost3[a-d] + + mova xm0, [pd_47130256+ 16] + mova m1, [pd_47130256] + phaddd m9, m8 + phaddd m5, m4 + phaddd m9, m5 + vpermd m0, m9 ; cost[0-3] + vpermd m1, m9 ; cost[4-7] | cost[0-3] + + ; now find the best cost + pmaxsd xm2, xm0, xm1 + pshufd xm3, xm2, q1032 + pmaxsd xm2, xm3 + pshufd xm3, xm2, q2301 + pmaxsd xm2, xm3 ; best cost + + ; find the idx using minpos + ; make everything other than the best cost negative via subtraction + ; find the min of unsigned 16-bit ints to sort out the negative values + psubd xm4, xm1, xm2 + psubd xm3, xm0, xm2 + packssdw xm3, xm4 + phminposuw xm3, xm3 + + ; convert idx to 32-bits + psrld xm3, 16 + movd eax, xm3 + + ; get idx^4 complement + vpermd m3, m1 + psubd xm2, xm3 + psrld xm2, 10 + movd [varq], xm2 + RET + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/cdef_avx512.asm b/third_party/dav1d/src/x86/cdef_avx512.asm new file mode 100644 index 0000000000..b4f9c008ca --- /dev/null +++ b/third_party/dav1d/src/x86/cdef_avx512.asm @@ -0,0 +1,860 @@ +; Copyright © 2020, VideoLAN and dav1d authors +; Copyright © 2020, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +%macro DUP4 1-* + %rep %0 + times 4 db %1 + %rotate 1 + %endrep +%endmacro + +%macro DIRS 16 ; cdef_directions[] + %rep 4 + 16 + 4 ; 6 7 0 1 2 3 4 5 6 7 0 1 + ; masking away unused bits allows us to use a single vpaddd {1to16} + ; instruction instead of having to do vpbroadcastd + paddb + db %13 & 0x3f, -%13 & 0x3f + %rotate 1 + %endrep +%endmacro + +SECTION_RODATA 64 + +lut_perm_4x4: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79 + db 16, 17, 0, 1, 2, 3, 4, 5, 18, 19, 8, 9, 10, 11, 12, 13 + db 20, 21, 80, 81, 82, 83, 84, 85, 22, 23, 32, 33, 34, 35, 36, 37 + db 98, 99,100,101,102,103,104,105, 50, 51, 52, 53, 54, 55, 56, 57 +lut_perm_4x8a: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79 + db 96, 97, 0, 1, 2, 3, 4, 5, 98, 99, 8, 9, 10, 11, 12, 13 +lut_perm_4x8b:db 100,101, 16, 17, 18, 19, 20, 21,102,103, 24, 25, 26, 27, 28, 29 + db 104,105, 32, 33, 34, 35, 36, 37,106,107, 40, 41, 42, 43, 44, 45 + db 108,109, 48, 49, 50, 51, 52, 53,110,111, 56, 57, 58, 59, 60, 61 + db 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95 +pd_01234567: dd 0, 1, 2, 3, 4, 5, 6, 7 +lut_perm_8x8a: db 32, 33, 34, 35, 36, 37, 38, 39, 48, 49, 50, 51, 52, 53, 54, 55 + db 36, 37, 38, 39, 40, 41, 42, 43, 52, 53, 54, 55, 56, 57, 58, 59 +lut_perm_8x8b: db 12, 13, 0, 1, 2, 3, 4, 5, 14, 15, 16, 17, 18, 19, 20, 21 + db 2, 3, 4, 5, 6, 7, 8, 9, 18, 19, 20, 21, 22, 23, 24, 25 + db 28, 29, 32, 33, 34, 35, 36, 37, 30, 31, 48, 49, 50, 51, 52, 53 + db 34, 35, 36, 37, 38, 39, 40, 41, 50, 51, 52, 53, 54, 55, 56, 57 +end_perm: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 + db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63 +end_perm_clip: db 0, 4, 8, 12, 2, 6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30 + db 32, 36, 40, 44, 34, 38, 42, 46, 48, 52, 56, 60, 50, 54, 58, 62 + db 1, 5, 9, 13, 3, 7, 11, 15, 17, 21, 25, 29, 19, 23, 27, 31 + db 33, 37, 41, 45, 35, 39, 43, 47, 49, 53, 57, 61, 51, 55, 59, 63 +edge_mask: dq 0x00003c3c3c3c0000, 0x00003f3f3f3f0000 ; 0000, 0001 + dq 0x0000fcfcfcfc0000, 0x0000ffffffff0000 ; 0010, 0011 + dq 0x00003c3c3c3c3c3c, 0x00003f3f3f3f3f3f ; 0100, 0101 + dq 0x0000fcfcfcfcfcfc, 0x0000ffffffffffff ; 0110, 0111 + dq 0x3c3c3c3c3c3c0000, 0x3f3f3f3f3f3f0000 ; 1000, 1001 + dq 0xfcfcfcfcfcfc0000, 0xffffffffffff0000 ; 1010, 1011 + dq 0x3c3c3c3c3c3c3c3c, 0x3f3f3f3f3f3f3f3f ; 1100, 1101 + dq 0xfcfcfcfcfcfcfcfc, 0xffffffffffffffff ; 1110, 1111 +px_idx: DUP4 18, 19, 20, 21, 26, 27, 28, 29, 34, 35, 36, 37, 42, 43, 44, 45 +cdef_dirs: DIRS -7,-14, 1, -6, 1, 2, 1, 10, 9, 18, 8, 17, 8, 16, 8, 15 +gf_shr: dq 0x0102040810204080, 0x0102040810204080 ; >> 0, >> 0 + dq 0x0204081020408000, 0x0408102040800000 ; >> 1, >> 2 + dq 0x0810204080000000, 0x1020408000000000 ; >> 3, >> 4 + dq 0x2040800000000000, 0x4080000000000000 ; >> 5, >> 6 +pri_tap: db 64, 64, 32, 32, 48, 48, 48, 48 ; left-shifted by 4 +sec_tap: db 32, 32, 16, 16 +pd_268435568: dd 268435568 + +SECTION .text 
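+ +; A scalar sketch of what the filters below compute (an illustrative +; reconstruction, not the authoritative spec; see dav1d's C reference): +; +; int constrain(int diff, int strength, int shift) { +; int adiff = abs(diff); +; return apply_sign(imin(adiff, imax(0, strength - (adiff >> shift))), diff); +; } +; +; per pixel: sum += pri/sec_tap[k] * constrain(p[k] - px, strength, shift) +; dst = px + ((8 + sum - (sum < 0)) >> 4), additionally clipped to the +; [min, max] of the sampled taps when both strengths are nonzero; the +; pd_268435568 bias ((1 << 28) + (7 << 4)) bakes this rounding into the +; accumulator. +; +; gf2p8affineqb has no scalar counterpart here: the gf_shr matrices above +; repurpose a GF(2^8) affine transform as a per-byte variable right shift, +; which is how abs(diff) >> shift is evaluated without widening bytes to words.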
+ +%if WIN64 +DECLARE_REG_TMP 4 +%else +DECLARE_REG_TMP 8 +%endif + +; lut: +; t0 t1 t2 t3 t4 t5 t6 t7 +; T0 T1 T2 T3 T4 T5 T6 T7 +; L0 L1 00 01 02 03 04 05 +; L2 L3 10 11 12 13 14 15 +; L4 L5 20 21 22 23 24 25 +; L6 L7 30 31 32 33 34 35 +; b0 b1 b2 b3 b4 b5 b6 b7 +; B0 B1 B2 B3 B4 B5 B6 B7 + +INIT_ZMM avx512icl +cglobal cdef_filter_4x4_8bpc, 5, 8, 13, dst, stride, left, top, bot, \ + pri, sec, dir, damping, edge +%define base r7-edge_mask + movq xmm0, [dstq+strideq*0] + movhps xmm0, [dstq+strideq*1] + lea r7, [edge_mask] + movq xmm1, [topq+strideq*0-2] + movhps xmm1, [topq+strideq*1-2] + mov r6d, edgem + vinserti32x4 ym0, ymm0, [leftq], 1 + lea r2, [strideq*3] + vinserti32x4 ym1, ymm1, [dstq+strideq*2], 1 + mova m5, [base+lut_perm_4x4] + vinserti32x4 m0, [dstq+r2], 2 + test r6b, 0x08 ; avoid buffer overread + jz .main + vinserti32x4 m1, [botq+strideq*0-4], 2 + vinserti32x4 m0, [botq+strideq*1-4], 3 +.main: + movifnidn prid, prim + mov t0d, dirm + mova m3, [base+px_idx] + mov r3d, dampingm + vpermi2b m5, m0, m1 ; lut + vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) + pxor m7, m7 + lea r3, [r7+r3*8] ; gf_shr + (damping - 30) * 8 + vpermb m6, m3, m5 ; px + cmp r6d, 0x0f + jne .mask_edges ; mask edges only if required + test prid, prid + jz .sec_only + vpaddd m1, m3, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir + vpermb m1, m1, m5 ; k0p0 k0p1 k1p0 k1p1 +%macro CDEF_FILTER_4x4_PRI 0 + vpcmpub k1, m6, m1, 6 ; px > pN + psubb m2, m1, m6 + lzcnt r6d, prid + vpsubb m2{k1}, m6, m1 ; abs(diff) + vpbroadcastb m4, prid + and prid, 1 + vgf2p8affineqb m9, m2, [r3+r6*8] {1to8}, 0 ; abs(diff) >> shift + movifnidn secd, secm + vpbroadcastd m10, [base+pri_tap+priq*4] + vpsubb m10{k1}, m7, m10 ; apply_sign(pri_tap) + psubusb m4, m9 ; imax(0, pri_strength - (abs(diff) >> shift))) + pminub m2, m4 + vpdpbusd m0, m2, m10 ; sum +%endmacro + CDEF_FILTER_4x4_PRI + test secd, secd + jz .end_no_clip + call .sec +.end_clip: + pminub m4, m6, m1 + pmaxub m1, m6 + pminub m5, m2, m3 + pmaxub m2, m3 + pminub m4, m5 + pmaxub m2, m1 + psrldq m1, m4, 2 + psrldq m3, m2, 2 + pminub m1, m4 + vpcmpw k1, m0, m7, 1 + vpshldd m6, m0, 8 + pmaxub m2, m3 + pslldq m3, m1, 1 + psubw m7, m0 + paddusw m0, m6 ; clip >0xff + vpsubusw m0{k1}, m6, m7 ; clip <0x00 + pslldq m4, m2, 1 + pminub m1, m3 + pmaxub m2, m4 + pmaxub m0, m1 + pminub m0, m2 + jmp .end +.sec_only: + movifnidn secd, secm + call .sec +.end_no_clip: + vpshldd m6, m0, 8 ; (px << 8) + ((sum > -8) << 4) + paddw m0, m6 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) +.end: + mova xm1, [base+end_perm] + vpermb m0, m1, m0 ; output in bits 8-15 of each dword + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+r2 ], xm0, 3 + RET +.mask_edges_sec_only: + movifnidn secd, secm + call .mask_edges_sec + jmp .end_no_clip +ALIGN function_align +.mask_edges: + vpbroadcastq m8, [base+edge_mask+r6*8] + test prid, prid + jz .mask_edges_sec_only + vpaddd m2, m3, [base+cdef_dirs+(t0+2)*4] {1to16} + vpshufbitqmb k1, m8, m2 ; index in-range + mova m1, m6 + vpermb m1{k1}, m2, m5 + CDEF_FILTER_4x4_PRI + test secd, secd + jz .end_no_clip + call .mask_edges_sec + jmp .end_clip +.mask_edges_sec: + vpaddd m4, m3, [base+cdef_dirs+(t0+4)*4] {1to16} + vpaddd m9, m3, [base+cdef_dirs+(t0+0)*4] {1to16} + vpshufbitqmb k1, m8, m4 + mova m2, m6 + vpermb m2{k1}, m4, m5 + vpshufbitqmb k1, m8, m9 + mova m3, m6 + vpermb m3{k1}, m9, m5 + jmp .sec_main +ALIGN function_align +.sec: + vpaddd m2, m3, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 + vpaddd m3, 
[base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 + vpermb m2, m2, m5 ; k0s0 k0s1 k1s0 k1s1 + vpermb m3, m3, m5 ; k0s2 k0s3 k1s2 k1s3 +.sec_main: + vpbroadcastd m8, [base+sec_tap] + vpcmpub k1, m6, m2, 6 + psubb m4, m2, m6 + vpbroadcastb m12, secd + lzcnt secd, secd + vpsubb m4{k1}, m6, m2 + vpcmpub k2, m6, m3, 6 + vpbroadcastq m11, [r3+secq*8] + gf2p8affineqb m10, m4, m11, 0 + psubb m5, m3, m6 + mova m9, m8 + vpsubb m8{k1}, m7, m8 + psubusb m10, m12, m10 + vpsubb m5{k2}, m6, m3 + pminub m4, m10 + vpdpbusd m0, m4, m8 + gf2p8affineqb m11, m5, m11, 0 + vpsubb m9{k2}, m7, m9 + psubusb m12, m11 + pminub m5, m12 + vpdpbusd m0, m5, m9 + ret + +DECLARE_REG_TMP 2, 7 + +; lut top lut bottom +; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25 +; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35 +; L0 L1 00 01 02 03 04 05 L8 L9 40 41 42 43 44 45 +; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55 +; L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65 +; L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75 +; L8 L9 40 41 42 43 44 45 b0 b1 b2 b3 b4 b5 b6 b7 +; La Lb 50 51 52 53 54 55 B0 B1 B2 B3 B4 B5 B6 B7 + +cglobal cdef_filter_4x8_8bpc, 5, 9, 22, dst, stride, left, top, bot, \ + pri, sec, dir, damping, edge +%define base r8-edge_mask + vpbroadcastd ym21, strided + mov r6d, edgem + lea r8, [edge_mask] + movq xm1, [topq+strideq*0-2] + pmulld ym21, [base+pd_01234567] + kxnorb k1, k1, k1 + movq xm2, [topq+strideq*1-2] + vpgatherdq m0{k1}, [dstq+ym21] ; +0+1 +2+3 +4+5 +6+7 + mova m14, [base+lut_perm_4x8a] + movu m15, [base+lut_perm_4x8b] + test r6b, 0x08 ; avoid buffer overread + jz .main + vinserti32x4 ym1, [botq+strideq*0-2], 1 + vinserti32x4 ym2, [botq+strideq*1-2], 1 +.main: + punpcklqdq ym1, ym2 + vinserti32x4 m1, [leftq], 2 ; -2-1 +8+9 left ____ + movifnidn prid, prim + mov t0d, dirm + mova m16, [base+px_idx] + mov r3d, dampingm + vpermi2b m14, m0, m1 ; lut top + vpermi2b m15, m0, m1 ; lut bottom + vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) + pxor m20, m20 + lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8 + vpermb m2, m16, m14 ; pxt + vpermb m3, m16, m15 ; pxb + mova m1, m0 + cmp r6b, 0x0f + jne .mask_edges ; mask edges only if required + test prid, prid + jz .sec_only + vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir + vpermb m4, m6, m14 ; pNt k0p0 k0p1 k1p0 k1p1 + vpermb m5, m6, m15 ; pNb +%macro CDEF_FILTER_4x8_PRI 0 + vpcmpub k1, m2, m4, 6 ; pxt > pNt + vpcmpub k2, m3, m5, 6 ; pxb > pNb + psubb m6, m4, m2 + psubb m7, m5, m3 + lzcnt r6d, prid + vpsubb m6{k1}, m2, m4 ; abs(diff_top) + vpsubb m7{k2}, m3, m5 ; abs(diff_bottom) + vpbroadcastb m13, prid + vpbroadcastq m9, [r3+r6*8] + and prid, 1 + vpbroadcastd m11, [base+pri_tap+priq*4] + vgf2p8affineqb m8, m6, m9, 0 ; abs(dt) >> shift + vgf2p8affineqb m9, m7, m9, 0 ; abs(db) >> shift + mova m10, m11 + movifnidn t1d, secm + vpsubb m10{k1}, m20, m11 ; apply_sign(pri_tap_top) + vpsubb m11{k2}, m20, m11 ; apply_sign(pri_tap_bottom) + psubusb m12, m13, m8 ; imax(0, pri_strength - (abs(dt) >> shift))) + psubusb m13, m13, m9 ; imax(0, pri_strength - (abs(db) >> shift))) + pminub m6, m12 + pminub m7, m13 + vpdpbusd m0, m6, m10 ; sum top + vpdpbusd m1, m7, m11 ; sum bottom +%endmacro + CDEF_FILTER_4x8_PRI + test t1d, t1d ; sec + jz .end_no_clip + call .sec +.end_clip: + pminub m10, m4, m2 + pminub m12, m6, m8 + pminub m11, m5, m3 + pminub m13, m7, m9 + pmaxub m4, m2 + pmaxub m6, m8 + pmaxub m5, m3 + pmaxub m7, m9 + pminub m10, m12 + pminub m11, m13 + pmaxub m4, m6 + pmaxub m5, m7 + mov r2d, 0xAAAAAAAA + kmovd k1, r2d + kxnorb k2, k2, k2 ; hw lw + 
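; a sketch of the intent of the shuffling below: the top/bottom dword sums + ; are interleaved back into 16-bit lanes (vpshrdd/vpblendmw) so that + ; dst = iclip(px + ((8 + sum - (sum < 0)) >> 4), min, max) is evaluated for + ; all rows at once, with min/max taken over px and every tap sampled above + 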
vpshrdd m12, m0, m1, 16 ; m1lw m0hw + vpshrdd m6, m10, m11, 16 ; m11lw m10hw + vpshrdd m8, m4, m5, 16 ; m5lw m4hw + vpblendmw m7{k1}, m10, m11 ; m11hw m10lw + vpblendmw m9{k1}, m4, m5 ; m5hw m4lw + vpblendmw m4{k1}, m0, m12 ; m1lw m0lw + vpblendmw m5{k1}, m12, m1 ; m1hw m0hw + vpshrdd m2, m3, 16 + pminub m6, m7 + pmaxub m8, m9 + mova ym14, [base+end_perm] + vpcmpw k1, m4, m20, 1 + vpshldw m2, m5, 8 + pslldq m7, m6, 1 + pslldq m9, m8, 1 + psubw m5, m20, m4 + paddusw m0, m4, m2 ; clip >0xff + pminub m6, m7 + pmaxub m8, m9 + psubusw m0{k1}, m2, m5 ; clip <0x00 + pmaxub m0, m6 + pminub m0, m8 + vpermb m0, m14, m0 + vpscatterdd [dstq+ym21]{k2}, ym0 + RET +.sec_only: + movifnidn t1d, secm + call .sec +.end_no_clip: + mova ym4, [base+end_perm] + kxnorb k1, k1, k1 + vpshldd m2, m0, 8 ; (px << 8) + ((sum > -8) << 4) + vpshldd m3, m1, 8 + paddw m0, m2 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) + paddw m1, m3 + pslld m0, 16 + vpshrdd m0, m1, 16 + vpermb m0, m4, m0 ; output in bits 8-15 of each word + vpscatterdd [dstq+ym21]{k1}, ym0 + RET +.mask_edges_sec_only: + movifnidn t1d, secm + call .mask_edges_sec + jmp .end_no_clip +ALIGN function_align +.mask_edges: + mov t1d, r6d + or r6d, 8 ; top 4x4 has bottom + or t1d, 4 ; bottom 4x4 has top + vpbroadcastq m17, [base+edge_mask+r6*8] + vpbroadcastq m18, [base+edge_mask+t1*8] + test prid, prid + jz .mask_edges_sec_only + vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} + vpshufbitqmb k1, m17, m6 ; index in-range + vpshufbitqmb k2, m18, m6 + mova m4, m2 + mova m5, m3 + vpermb m4{k1}, m6, m14 + vpermb m5{k2}, m6, m15 + CDEF_FILTER_4x8_PRI + test t1d, t1d + jz .end_no_clip + call .mask_edges_sec + jmp .end_clip +.mask_edges_sec: + vpaddd m10, m16, [base+cdef_dirs+(t0+4)*4] {1to16} + vpaddd m11, m16, [base+cdef_dirs+(t0+0)*4] {1to16} + vpshufbitqmb k1, m17, m10 + vpshufbitqmb k2, m18, m10 + vpshufbitqmb k3, m17, m11 + vpshufbitqmb k4, m18, m11 + mova m6, m2 + mova m7, m3 + mova m8, m2 + mova m9, m3 + vpermb m6{k1}, m10, m14 + vpermb m7{k2}, m10, m15 + vpermb m8{k3}, m11, m14 + vpermb m9{k4}, m11, m15 + jmp .sec_main +ALIGN function_align +.sec: + vpaddd m8, m16, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 + vpaddd m9, m16, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 + vpermb m6, m8, m14 ; pNt k0s0 k0s1 k1s0 k1s1 + vpermb m7, m8, m15 ; pNb + vpermb m8, m9, m14 ; pNt k0s2 k0s3 k1s2 k1s3 + vpermb m9, m9, m15 ; pNb +.sec_main: + vpbroadcastb m18, t1d + lzcnt t1d, t1d + vpcmpub k1, m2, m6, 6 + vpcmpub k2, m3, m7, 6 + vpcmpub k3, m2, m8, 6 + vpcmpub k4, m3, m9, 6 + vpbroadcastq m17, [r3+t1*8] + psubb m10, m6, m2 + psubb m11, m7, m3 + psubb m12, m8, m2 + psubb m13, m9, m3 + vpsubb m10{k1}, m2, m6 ; abs(dt0) + vpsubb m11{k2}, m3, m7 ; abs(db0) + vpsubb m12{k3}, m2, m8 ; abs(dt1) + vpsubb m13{k4}, m3, m9 ; abs(db1) + vpbroadcastd m19, [base+sec_tap] + gf2p8affineqb m14, m10, m17, 0 ; abs(dt0) >> shift + gf2p8affineqb m15, m11, m17, 0 ; abs(db0) >> shift + gf2p8affineqb m16, m12, m17, 0 ; abs(dt1) >> shift + gf2p8affineqb m17, m13, m17, 0 ; abs(db1) >> shift + psubusb m14, m18, m14 ; imax(0, sec_strength - (abs(dt0) >> shift))) + psubusb m15, m18, m15 ; imax(0, sec_strength - (abs(db0) >> shift))) + psubusb m16, m18, m16 ; imax(0, sec_strength - (abs(dt1) >> shift))) + psubusb m17, m18, m17 ; imax(0, sec_strength - (abs(db1) >> shift))) + pminub m10, m14 + pminub m11, m15 + pminub m12, m16 + pminub m13, m17 + mova m14, m19 + mova m15, m19 + mova m16, m19 + vpsubb m14{k1}, m20, m19 ; apply_sign(sec_tap_top_0) + vpsubb m15{k2}, m20, m19 ; apply_sign(sec_tap_bottom_0) + 
vpsubb m16{k3}, m20, m19 ; apply_sign(sec_tap_top_1) + vpsubb m19{k4}, m20, m19 ; apply_sign(sec_tap_bottom_1) + vpdpbusd m0, m10, m14 + vpdpbusd m1, m11, m15 + vpdpbusd m0, m12, m16 + vpdpbusd m1, m13, m19 + ret + +; lut tl lut tr +; t0 t1 t2 t3 t4 t5 t6 t7 t4 t5 t6 t7 t8 t9 ta tb +; T0 T1 T2 T3 T4 T5 T6 T7 T4 T5 T6 T7 T8 T9 Ta Tb +; L0 L1 00 01 02 03 04 05 02 03 04 05 06 07 08 09 +; L2 L3 10 11 12 13 14 15 12 13 14 15 16 17 18 19 +; L4 L5 20 21 22 23 24 25 22 23 24 25 26 27 28 29 +; L6 L7 30 31 32 33 34 35 32 33 34 35 36 37 38 39 +; L8 L9 40 41 42 43 44 45 42 43 44 45 46 47 48 49 +; La Lb 50 51 52 53 54 55 52 53 54 55 56 57 58 59 +; lut bl lut br +; L4 L5 20 21 22 23 24 25 22 23 24 25 26 27 28 29 +; L6 L7 30 31 32 33 34 35 32 33 34 35 36 37 38 39 +; L8 L9 40 41 42 43 44 45 42 43 44 45 46 47 48 49 +; La Lb 50 51 52 53 54 55 52 53 54 55 56 57 58 59 +; Lc Ld 60 61 62 63 64 65 62 63 64 65 66 67 68 69 +; Le Lf 70 71 72 73 74 75 72 73 74 75 76 77 78 79 +; b0 b1 b2 b3 b4 b5 b6 b7 b4 b5 b6 b7 b8 b9 ba bb +; B0 B1 B2 B3 B4 B5 B6 B7 B4 B5 B6 B7 B8 B9 Ba Bb + +cglobal cdef_filter_8x8_8bpc, 5, 11, 32, 4*64, dst, stride, left, top, bot, \ + pri, sec, dir, damping, edge +%define base r8-edge_mask + movu xm16, [dstq+strideq*0] + pinsrd xm16, [leftq+4*0], 3 + mov r6d, edgem + vinserti128 ym16, [dstq+strideq*1], 1 + lea r10, [dstq+strideq*4] + movu xm17, [dstq+strideq*2] + vinserti32x4 m16, [topq+strideq*0-2], 2 + lea r9, [strideq*3] + pinsrd xm17, [leftq+4*1], 3 + vinserti32x4 m16, [topq+strideq*1-2], 3 ; 0 1 t T + lea r8, [edge_mask] + vinserti128 ym17, [dstq+r9 ], 1 + vpbroadcastd ym18, [leftq+4*2] + vpblendd ym17, ym18, 0x80 + movu xm18, [r10 +strideq*2] + vinserti32x4 m17, [r10 +strideq*0], 2 + pinsrd xm18, [leftq+4*3], 3 + vinserti32x4 m17, [r10 +strideq*1], 3 ; 2 3 4 5 + vinserti128 ym18, [r10 +r9 ], 1 + test r6b, 0x08 ; avoid buffer overread + jz .main + vinserti32x4 m18, [botq+strideq*0-2], 2 + vinserti32x4 m18, [botq+strideq*1-2], 3 ; 6 7 b B +.main: + mova m0, [base+lut_perm_8x8a] + movu m1, [base+lut_perm_8x8b] + mova m30, [base+px_idx] + vpermb m16, m0, m16 + movifnidn prid, prim + vpermb m17, m1, m17 + mov t0d, dirm + vpermb m18, m0, m18 + mov r3d, dampingm + vshufi32x4 m12, m16, m17, q2020 ; lut tl + vshufi32x4 m13, m16, m17, q3131 ; lut tr + vshufi32x4 m14, m17, m18, q0220 ; lut bl + vshufi32x4 m15, m17, m18, q1331 ; lut br + vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) + pxor m31, m31 + lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8 + vpermb m4, m30, m12 ; pxtl + mova m1, m0 + vpermb m5, m30, m13 ; pxtr + mova m2, m0 + vpermb m6, m30, m14 ; pxbl + mova m3, m0 + vpermb m7, m30, m15 ; pxbr + cmp r6b, 0x0f + jne .mask_edges ; mask edges only if required + test prid, prid + jz .sec_only + vpaddd m11, m30, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir + vpermb m8, m11, m12 ; pNtl k0p0 k0p1 k1p0 k1p1 + vpermb m9, m11, m13 ; pNtr + vpermb m10, m11, m14 ; pNbl + vpermb m11, m11, m15 ; pNbr +%macro CDEF_FILTER_8x8_PRI 0 + vpcmpub k1, m4, m8, 6 ; pxtl > pNtl + vpcmpub k2, m5, m9, 6 ; pxtr > pNtr + vpcmpub k3, m6, m10, 6 ; pxbl > pNbl + vpcmpub k4, m7, m11, 6 ; pxbr > pNbr + psubb m16, m8, m4 + psubb m17, m9, m5 + psubb m18, m10, m6 + psubb m19, m11, m7 + lzcnt r6d, prid + vpsubb m16{k1}, m4, m8 ; abs(diff_tl) + vpsubb m17{k2}, m5, m9 ; abs(diff_tr) + vpsubb m18{k3}, m6, m10 ; abs(diff_bl) + vpsubb m19{k4}, m7, m11 ; abs(diff_br) + vpbroadcastq m28, [r3+r6*8] + vpbroadcastb m29, prid + and prid, 1 + vpbroadcastd m27, [base+pri_tap+priq*4] + vgf2p8affineqb m20, m16, m28, 0 ; abs(dtl) >> 
shift + vgf2p8affineqb m21, m17, m28, 0 ; abs(dtr) >> shift + vgf2p8affineqb m22, m18, m28, 0 ; abs(dbl) >> shift + vgf2p8affineqb m23, m19, m28, 0 ; abs(dbl) >> shift + mova m24, m27 + mova m25, m27 + mova m26, m27 + movifnidn t1d, secm + vpsubb m24{k1}, m31, m27 ; apply_sign(pri_tap_tl) + vpsubb m25{k2}, m31, m27 ; apply_sign(pri_tap_tr) + vpsubb m26{k3}, m31, m27 ; apply_sign(pri_tap_tl) + vpsubb m27{k4}, m31, m27 ; apply_sign(pri_tap_tr) + psubusb m20, m29, m20 ; imax(0, pri_strength - (abs(dtl) >> shift))) + psubusb m21, m29, m21 ; imax(0, pri_strength - (abs(dtr) >> shift))) + psubusb m22, m29, m22 ; imax(0, pri_strength - (abs(dbl) >> shift))) + psubusb m23, m29, m23 ; imax(0, pri_strength - (abs(dbr) >> shift))) + pminub m16, m20 + pminub m17, m21 + pminub m18, m22 + pminub m19, m23 + vpdpbusd m0, m16, m24 ; sum tl + vpdpbusd m1, m17, m25 ; sum tr + vpdpbusd m2, m18, m26 ; sum bl + vpdpbusd m3, m19, m27 ; sum br +%endmacro + CDEF_FILTER_8x8_PRI + test t1d, t1d ; sec + jz .end_no_clip + call .sec +.end_clip: + pminub m20, m8, m4 + pminub m24, m12, m16 + pminub m21, m9, m5 + pminub m25, m13, m17 + pminub m22, m10, m6 + pminub m26, m14, m18 + pminub m23, m11, m7 + pminub m27, m15, m19 + pmaxub m8, m4 + pmaxub m12, m16 + pmaxub m9, m5 + pmaxub m13, m17 + pmaxub m10, m6 + pmaxub m14, m18 + pmaxub m11, m7 + pmaxub m15, m19 + pminub m20, m24 + pminub m21, m25 + pminub m22, m26 + pminub m23, m27 + pmaxub m8, m12 + pmaxub m9, m13 + pmaxub m10, m14 + pmaxub m11, m15 + mov r2d, 0xAAAAAAAA + kmovd k1, r2d + vpshrdd m24, m0, m1, 16 + vpshrdd m25, m2, m3, 16 + vpshrdd m12, m20, m21, 16 + vpshrdd m14, m22, m23, 16 + vpshrdd m16, m8, m9, 16 + vpshrdd m18, m10, m11, 16 + vpblendmw m13{k1}, m20, m21 + vpblendmw m15{k1}, m22, m23 + vpblendmw m17{k1}, m8, m9 + vpblendmw m19{k1}, m10, m11 + vpblendmw m20{k1}, m0, m24 + vpblendmw m21{k1}, m24, m1 + vpblendmw m22{k1}, m2, m25 + vpblendmw m23{k1}, m25, m3 + vpshrdd m4, m5, 16 + vpshrdd m6, m7, 16 + pminub m12, m13 + pminub m14, m15 + pmaxub m16, m17 + pmaxub m18, m19 + mova m8, [base+end_perm_clip] + vpcmpw k2, m20, m31, 1 + vpcmpw k3, m22, m31, 1 + vpshldw m4, m21, 8 + vpshldw m6, m23, 8 + kunpckdq k1, k1, k1 + kxnorb k4, k4, k4 + vpshrdw m11, m12, m14, 8 + vpshrdw m15, m16, m18, 8 + vpblendmb m13{k1}, m12, m14 + vpblendmb m17{k1}, m16, m18 + psubw m21, m31, m20 + psubw m23, m31, m22 + paddusw m0, m20, m4 ; clip >0xff + paddusw m1, m22, m6 + pminub m11, m13 + pmaxub m15, m17 + psubusw m0{k2}, m4, m21 ; clip <0x00 + psubusw m1{k3}, m6, m23 + psrlw m0, 8 + vmovdqu8 m0{k1}, m1 + pmaxub m0, m11 + pminub m0, m15 + vpermb m0, m8, m0 + vextracti32x4 xm1, m0, 1 + vextracti32x4 xm2, m0, 2 + vextracti32x4 xm3, m0, 3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*2], xm1 + movq [r10 +strideq*0], xm2 + movq [r10 +strideq*2], xm3 + movhps [dstq+strideq*1], xm0 + movhps [dstq+r9 ], xm1 + movhps [r10 +strideq*1], xm2 + movhps [r10 +r9 ], xm3 + RET +.sec_only: + movifnidn t1d, secm + call .sec +.end_no_clip: + mova xm8, [base+end_perm] + kxnorb k1, k1, k1 + vpshldd m4, m0, 8 ; (px << 8) + ((sum > -8) << 4) + vpshldd m5, m1, 8 + vpshldd m6, m2, 8 + vpshldd m7, m3, 8 + paddw m0, m4 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) + paddw m1, m5 + paddw m2, m6 + paddw m3, m7 + vpermb m0, m8, m0 + vpermb m1, m8, m1 + vpermb m2, m8, m2 + vpermb m3, m8, m3 + punpckldq m4, m0, m1 + punpckhdq m0, m1 + punpckldq m5, m2, m3 + punpckhdq m2, m3 + movq [dstq+strideq*0], xm4 + movq [dstq+strideq*2], xm0 + movq [r10 +strideq*0], xm5 + movq [r10 +strideq*2], xm2 + movhps 
[dstq+strideq*1], xm4 + movhps [dstq+r9 ], xm0 + movhps [r10 +strideq*1], xm5 + movhps [r10 +r9 ], xm2 + RET +.mask_edges_sec_only: + movifnidn t1d, secm + call .mask_edges_sec + jmp .end_no_clip +ALIGN function_align +.mask_edges: + mov t0d, r6d + mov t1d, r6d + or t0d, 0xA ; top-left 4x4 has bottom and right + or t1d, 0x9 ; top-right 4x4 has bottom and left + vpbroadcastq m26, [base+edge_mask+t0*8] + vpbroadcastq m27, [base+edge_mask+t1*8] + mov t1d, r6d + or r6d, 0x6 ; bottom-left 4x4 has top and right + or t1d, 0x5 ; bottom-right 4x4 has top and left + vpbroadcastq m28, [base+edge_mask+r6*8] + vpbroadcastq m29, [base+edge_mask+t1*8] + mov t0d, dirm + test prid, prid + jz .mask_edges_sec_only + vpaddd m20, m30, [base+cdef_dirs+(t0+2)*4] {1to16} + vpshufbitqmb k1, m26, m20 ; index in-range + vpshufbitqmb k2, m27, m20 + vpshufbitqmb k3, m28, m20 + vpshufbitqmb k4, m29, m20 + mova m8, m4 + mova m9, m5 + mova m10, m6 + mova m11, m7 + vpermb m8{k1}, m20, m12 + vpermb m9{k2}, m20, m13 + vpermb m10{k3}, m20, m14 + vpermb m11{k4}, m20, m15 + mova [rsp+0x00], m26 + mova [rsp+0x40], m27 + mova [rsp+0x80], m28 + mova [rsp+0xC0], m29 + CDEF_FILTER_8x8_PRI + test t1d, t1d + jz .end_no_clip + mova m26, [rsp+0x00] + mova m27, [rsp+0x40] + mova m28, [rsp+0x80] + mova m29, [rsp+0xC0] + call .mask_edges_sec + jmp .end_clip +.mask_edges_sec: + vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} + vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} + vpshufbitqmb k1, m26, m20 + vpshufbitqmb k2, m27, m20 + vpshufbitqmb k3, m28, m20 + vpshufbitqmb k4, m29, m20 + mova m16, m4 + mova m17, m5 + mova m18, m6 + mova m19, m7 + vpermb m16{k1}, m20, m12 + vpermb m17{k2}, m20, m13 + vpermb m18{k3}, m20, m14 + vpermb m19{k4}, m20, m15 + vpshufbitqmb k1, m26, m21 + vpshufbitqmb k2, m27, m21 + vpshufbitqmb k3, m28, m21 + vpshufbitqmb k4, m29, m21 + vpermb m12, m21, m12 + vpermb m13, m21, m13 + vpermb m14, m21, m14 + vpermb m15, m21, m15 + vpblendmb m12{k1}, m4, m12 + vpblendmb m13{k2}, m5, m13 + vpblendmb m14{k3}, m6, m14 + vpblendmb m15{k4}, m7, m15 + jmp .sec_main +ALIGN function_align +.sec: + vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 + vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 + vpermb m16, m20, m12 ; pNtl k0s0 k0s1 k1s0 k1s1 + vpermb m17, m20, m13 ; pNtr + vpermb m18, m20, m14 ; pNbl + vpermb m19, m20, m15 ; pNbr + vpermb m12, m21, m12 ; pNtl k0s2 k0s3 k1s2 k1s3 + vpermb m13, m21, m13 ; pNtr + vpermb m14, m21, m14 ; pNbl + vpermb m15, m21, m15 ; pNbr +.sec_main: +%macro CDEF_FILTER_8x8_SEC 4-5 0 ; load constants + vpcmpub k1, m4, %1, 6 + vpcmpub k2, m5, %2, 6 + vpcmpub k3, m6, %3, 6 + vpcmpub k4, m7, %4, 6 + psubb m20, %1, m4 + psubb m21, %2, m5 + psubb m22, %3, m6 + psubb m23, %4, m7 +%if %5 + vpbroadcastb m28, t1d + lzcnt t1d, t1d + vpbroadcastq m29, [r3+t1*8] +%endif + vpsubb m20{k1}, m4, %1 + vpsubb m21{k2}, m5, %2 + vpsubb m22{k3}, m6, %3 + vpsubb m23{k4}, m7, %4 + gf2p8affineqb m24, m20, m29, 0 + gf2p8affineqb m25, m21, m29, 0 + gf2p8affineqb m26, m22, m29, 0 + gf2p8affineqb m27, m23, m29, 0 +%if %5 + vpbroadcastd m30, [base+sec_tap] +%endif + psubusb m24, m28, m24 + psubusb m25, m28, m25 + psubusb m26, m28, m26 + psubusb m27, m28, m27 + pminub m20, m24 + pminub m21, m25 + pminub m22, m26 + pminub m23, m27 + mova m24, m30 + mova m25, m30 + mova m26, m30 + mova m27, m30 + vpsubb m24{k1}, m31, m30 + vpsubb m25{k2}, m31, m30 + vpsubb m26{k3}, m31, m30 + vpsubb m27{k4}, m31, m30 + vpdpbusd m0, m20, m24 + vpdpbusd m1, m21, m25 + vpdpbusd m2, m22, m26 + vpdpbusd m3, m23, m27 
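+ ; the four dword accumulators keep the quadrant sums apart (m0=tl, m1=tr, + ; m2=bl, m3=br, matching the "sum tl/tr/bl/br" comments in the pri macro); + ; the macro is invoked twice below, once per pair of secondary taps, and + ; %5 loads the strength/shift constants only on the first pass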
+%endmacro + CDEF_FILTER_8x8_SEC m16, m17, m18, m19, 1 + CDEF_FILTER_8x8_SEC m12, m13, m14, m15 + ret + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/cdef_sse.asm b/third_party/dav1d/src/x86/cdef_sse.asm new file mode 100644 index 0000000000..1b353121f4 --- /dev/null +++ b/third_party/dav1d/src/x86/cdef_sse.asm @@ -0,0 +1,1357 @@ +; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; Copyright © 2019, VideoLabs +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
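+ +; Implementation notes for the SSE versions below (a sketch of the +; portability tricks; pre-AVX ISAs lack several byte-granularity ops): +; - there is no per-byte shift, so abs(diff) >> shift is emulated by masking +; with tap_table's shift masks and shifting words (see "emulate 8-bit +; shift" in ACCUMULATE_TAP) +; - div_table holds 840/n scale factors so squared partial sums of different +; lengths are normalized by multiplication rather than division +; - out-of-bounds pixels get the sentinel 0x8000 with SSE4.1 (large unsigned, +; negative signed, removable via pmaxsw/pminuw) and 0x7FFF otherwise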
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA 16 + +%macro DUP8 1-* + %rep %0 + times 8 db %1 + %rotate 1 + %endrep +%endmacro + +div_table_sse4: dd 840, 420, 280, 210, 168, 140, 120, 105 + dd 420, 210, 140, 105, 105, 105, 105, 105 +div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210 + dw 168, 168, 140, 140, 120, 120, 105, 105 + dw 420, 420, 210, 210, 140, 140, 105, 105 + dw 105, 105, 105, 105, 105, 105, 105, 105 +const shufw_6543210x, \ + db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 +shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 +pw_8: times 8 dw 8 +pw_128: times 8 dw 128 +pw_256: times 8 dw 256 +pw_2048: times 8 dw 2048 +pw_0x7FFF: times 8 dw 0x7FFF +pw_0x8000: times 8 dw 0x8000 +tap_table: ; masks for 8-bit shift emulation + DUP8 0xFF, 0xFE, 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0x80 + ; weights + DUP8 4, 2, 3, 3, 2, 1 + ; taps indices + db -1 * 16 + 1, -2 * 16 + 2 + db 0 * 16 + 1, -1 * 16 + 2 + db 0 * 16 + 1, 0 * 16 + 2 + db 0 * 16 + 1, 1 * 16 + 2 + db 1 * 16 + 1, 2 * 16 + 2 + db 1 * 16 + 0, 2 * 16 + 1 + db 1 * 16 + 0, 2 * 16 + 0 + db 1 * 16 + 0, 2 * 16 - 1 + ; the last 6 are repeats of the first 6 so we don't need to & 7 + db -1 * 16 + 1, -2 * 16 + 2 + db 0 * 16 + 1, -1 * 16 + 2 + db 0 * 16 + 1, 0 * 16 + 2 + db 0 * 16 + 1, 1 * 16 + 2 + db 1 * 16 + 1, 2 * 16 + 2 + db 1 * 16 + 0, 2 * 16 + 1 + +SECTION .text + +%macro movif32 2 + %if ARCH_X86_32 + mov %1, %2 + %endif +%endmacro + +%macro PMOVZXBW 2-3 0 ; %3 = half + %if cpuflag(sse4) && %3 == 0 + pmovzxbw %1, %2 + %else + %if %3 == 1 + movd %1, %2 + %else + movq %1, %2 + %endif + punpcklbw %1, m7 + %endif +%endmacro + +%macro PSHUFB_0 2 + %if cpuflag(ssse3) + pshufb %1, %2 + %else + punpcklbw %1, %1 + pshuflw %1, %1, q0000 + punpcklqdq %1, %1 + %endif +%endmacro + +%macro MOVDDUP 2 +%if cpuflag(ssse3) + movddup %1, %2 +%else + movq %1, %2 + punpcklqdq %1, %1 +%endif +%endmacro + +%macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, minmax + ; load p0/p1 + movsx offq, byte [dirq+kq+%1+14*8] ; off1 + %if %6 == 4 + movq m5, [stkq+offq*2+32*0] ; p0 + movhps m5, [stkq+offq*2+32*1] + %else + movu m5, [stkq+offq*2+32*0] ; p0 + %endif + neg offq ; -off1 + %if %6 == 4 + movq m6, [stkq+offq*2+32*0] ; p1 + movhps m6, [stkq+offq*2+32*1] + %else + movu m6, [stkq+offq*2+32*0] ; p1 + %endif + %if %7 + %if cpuflag(sse4) + ; out of bounds values are set to a value that is a both a large unsigned + ; value and a negative signed value. 
+ ; use signed max and unsigned min to remove them + pmaxsw m7, m5 + pminuw m8, m5 + pmaxsw m7, m6 + pminuw m8, m6 + %else + pcmpeqw m3, m14, m5 + pminsw m8, m5 ; min after p0 + pandn m3, m5 + pmaxsw m7, m3 ; max after p0 + pcmpeqw m3, m14, m6 + pminsw m8, m6 ; min after p1 + pandn m3, m6 + pmaxsw m7, m3 ; max after p1 + %endif + %endif + + ; accumulate sum[m13] over p0/p1 + psubw m5, m4 ; diff_p0(p0 - px) + psubw m6, m4 ; diff_p1(p1 - px) + packsswb m5, m6 ; convert pixel diff to 8-bit + %if cpuflag(ssse3) + pshufb m5, m13 ; group diffs p0 and p1 into pairs + pabsb m6, m5 + psignb m3, %5, m5 + %else + movlhps m6, m5 + punpckhbw m6, m5 + pxor m5, m5 + pcmpgtb m5, m6 + paddb m6, m5 + pxor m6, m5 + paddb m3, %5, m5 + pxor m3, m5 + %endif + pand m9, %3, m6 ; emulate 8-bit shift + psrlw m9, %2 + psubusb m5, %4, m9 + pminub m5, m6 ; constrain(diff_p) + %if cpuflag(ssse3) + pmaddubsw m5, m3 ; constrain(diff_p) * taps + %else + psrlw m9, m5, 8 + psraw m6, m3, 8 + psllw m5, 8 + psllw m3, 8 + pmullw m9, m6 + pmulhw m5, m3 + paddw m5, m9 + %endif + paddw m0, m5 +%endmacro + +%macro LOAD_BODY 3 ; dst, src, block_width + %if %3 == 4 + PMOVZXBW m0, [%2+strideq*0] + PMOVZXBW m1, [%2+strideq*1] + PMOVZXBW m2, [%2+strideq*2] + PMOVZXBW m3, [%2+stride3q] + mova [%1+32*0], m0 + mova [%1+32*1], m1 + mova [%1+32*2], m2 + mova [%1+32*3], m3 + %else + movu m0, [%2+strideq*0] + movu m1, [%2+strideq*1] + movu m2, [%2+strideq*2] + movu m3, [%2+stride3q] + punpcklbw m4, m0, m7 + punpckhbw m0, m7 + mova [%1+32*0+ 0], m4 + mova [%1+32*0+16], m0 + punpcklbw m4, m1, m7 + punpckhbw m1, m7 + mova [%1+32*1+ 0], m4 + mova [%1+32*1+16], m1 + punpcklbw m4, m2, m7 + punpckhbw m2, m7 + mova [%1+32*2+ 0], m4 + mova [%1+32*2+16], m2 + punpcklbw m4, m3, m7 + punpckhbw m3, m7 + mova [%1+32*3+ 0], m4 + mova [%1+32*3+16], m3 + %endif +%endmacro + +%macro CDEF_FILTER_END 2 ; w, minmax + pxor m6, m6 + pcmpgtw m6, m0 + paddw m0, m6 + %if cpuflag(ssse3) + pmulhrsw m0, m15 + %else + paddw m0, m15 + psraw m0, 4 + %endif + paddw m4, m0 + %if %2 + pminsw m4, m7 + pmaxsw m4, m8 + %endif + packuswb m4, m4 + %if %1 == 4 + movd [dstq+strideq*0], m4 + psrlq m4, 32 + movd [dstq+strideq*1], m4 + add stkq, 32*2 + lea dstq, [dstq+strideq*2] + %else + movq [dstq], m4 + add stkq, 32 + add dstq, strideq + %endif +%endmacro + +%macro CDEF_FILTER 2 ; w, h + %if ARCH_X86_64 +cglobal cdef_filter_%1x%2_8bpc, 5, 9, 16, 3 * 16 + (%2+4)*32, \ + dst, stride, left, top, bot, pri, dst4, edge, \ + stride3 + %define px rsp+3*16+2*32 + %define base 0 + %else +cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \ + dst, stride, left, edge, stride3 + %define topq r2 + %define botq r2 + %define dst4q r2 + LEA r5, tap_table + %define px esp+7*16+2*32 + %define base r5-tap_table + %endif + mov edged, r9m + %if cpuflag(sse4) + %define OUT_OF_BOUNDS_MEM [base+pw_0x8000] + %else + %define OUT_OF_BOUNDS_MEM [base+pw_0x7FFF] + %endif + mova m6, OUT_OF_BOUNDS_MEM + pxor m7, m7 + + ; prepare pixel buffers - body/right + %if %2 == 8 + lea dst4q, [dstq+strideq*4] + %endif + lea stride3q, [strideq*3] + test edgeb, 2 ; have_right + jz .no_right + LOAD_BODY px, dstq, %1 + %if %2 == 8 + LOAD_BODY px+4*32, dst4q, %1 + %endif + jmp .body_done +.no_right: + PMOVZXBW m0, [dstq+strideq*0], %1 == 4 + PMOVZXBW m1, [dstq+strideq*1], %1 == 4 + PMOVZXBW m2, [dstq+strideq*2], %1 == 4 + PMOVZXBW m3, [dstq+stride3q ], %1 == 4 + mova [px+32*0], m0 + mova [px+32*1], m1 + mova [px+32*2], m2 + mova [px+32*3], m3 + movd [px+32*0+%1*2], m6 + movd [px+32*1+%1*2], m6 + movd [px+32*2+%1*2], m6 
+ movd [px+32*3+%1*2], m6 + %if %2 == 8 + PMOVZXBW m0, [dst4q+strideq*0], %1 == 4 + PMOVZXBW m1, [dst4q+strideq*1], %1 == 4 + PMOVZXBW m2, [dst4q+strideq*2], %1 == 4 + PMOVZXBW m3, [dst4q+stride3q ], %1 == 4 + mova [px+32*4], m0 + mova [px+32*5], m1 + mova [px+32*6], m2 + mova [px+32*7], m3 + movd [px+32*4+%1*2], m6 + movd [px+32*5+%1*2], m6 + movd [px+32*6+%1*2], m6 + movd [px+32*7+%1*2], m6 + %endif +.body_done: + + ; top + movifnidn topq, r3mp + test edgeb, 4 ; have_top + jz .no_top + test edgeb, 1 ; have_left + jz .top_no_left + test edgeb, 2 ; have_right + jz .top_no_right + %if %1 == 4 + PMOVZXBW m0, [topq+strideq*0-2] + PMOVZXBW m1, [topq+strideq*1-2] + %else + movu m0, [topq+strideq*0-4] + movu m1, [topq+strideq*1-4] + punpckhbw m2, m0, m7 + punpcklbw m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m1, m7 + movu [px-32*2+8], m2 + movu [px-32*1+8], m3 + %endif + movu [px-32*2-%1], m0 + movu [px-32*1-%1], m1 + jmp .top_done +.top_no_right: + %if %1 == 4 + PMOVZXBW m0, [topq+strideq*0-%1] + PMOVZXBW m1, [topq+strideq*1-%1] + movu [px-32*2-8], m0 + movu [px-32*1-8], m1 + %else + movu m0, [topq+strideq*0-%1] + movu m1, [topq+strideq*1-%2] + punpckhbw m2, m0, m7 + punpcklbw m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m1, m7 + mova [px-32*2-16], m0 + mova [px-32*2+ 0], m2 + mova [px-32*1-16], m1 + mova [px-32*1+ 0], m3 + %endif + movd [px-32*2+%1*2], m6 + movd [px-32*1+%1*2], m6 + jmp .top_done +.top_no_left: + test edgeb, 2 ; have_right + jz .top_no_left_right + %if %1 == 4 + PMOVZXBW m0, [topq+strideq*0] + PMOVZXBW m1, [topq+strideq*1] + %else + movu m0, [topq+strideq*0] + movu m1, [topq+strideq*1] + punpckhbw m2, m0, m7 + punpcklbw m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m1, m7 + movd [px-32*2+16], m2 + movd [px-32*1+16], m3 + %endif + movd [px-32*2- 4], m6 + movd [px-32*1- 4], m6 + mova [px-32*2+ 0], m0 + mova [px-32*1+ 0], m1 + jmp .top_done +.top_no_left_right: + PMOVZXBW m0, [topq+strideq*0], %1 == 4 + PMOVZXBW m1, [topq+strideq*1], %1 == 4 + movd [px-32*2-4], m6 + movd [px-32*1-4], m6 + mova [px-32*2+0], m0 + mova [px-32*1+0], m1 + movd [px-32*2+%1*2], m6 + movd [px-32*1+%1*2], m6 + jmp .top_done +.no_top: + movu [px-32*2- 4], m6 + movu [px-32*1- 4], m6 + %if %1 == 8 + movq [px-32*2+12], m6 + movq [px-32*1+12], m6 + %endif +.top_done: + + ; left + test edgeb, 1 ; have_left + jz .no_left + movifnidn leftq, leftmp + %if %2 == 4 + movq m0, [leftq] + %else + movu m0, [leftq] + %endif + %if %2 == 4 + punpcklbw m0, m7 + %else + punpckhbw m1, m0, m7 + punpcklbw m0, m7 + movhlps m3, m1 + movd [px+32*4-4], m1 + movd [px+32*6-4], m3 + psrlq m1, 32 + psrlq m3, 32 + movd [px+32*5-4], m1 + movd [px+32*7-4], m3 + %endif + movhlps m2, m0 + movd [px+32*0-4], m0 + movd [px+32*2-4], m2 + psrlq m0, 32 + psrlq m2, 32 + movd [px+32*1-4], m0 + movd [px+32*3-4], m2 + jmp .left_done +.no_left: + movd [px+32*0-4], m6 + movd [px+32*1-4], m6 + movd [px+32*2-4], m6 + movd [px+32*3-4], m6 + %if %2 == 8 + movd [px+32*4-4], m6 + movd [px+32*5-4], m6 + movd [px+32*6-4], m6 + movd [px+32*7-4], m6 + %endif +.left_done: + + ; bottom + movifnidn botq, r4mp + test edgeb, 8 ; have_bottom + jz .no_bottom + test edgeb, 1 ; have_left + jz .bottom_no_left + test edgeb, 2 ; have_right + jz .bottom_no_right + %if %1 == 4 + PMOVZXBW m0, [botq+strideq*0-(%1/2)] + PMOVZXBW m1, [botq+strideq*1-(%1/2)] + %else + movu m0, [botq+strideq*0-4] + movu m1, [botq+strideq*1-4] + punpckhbw m2, m0, m7 + punpcklbw m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m1, m7 + movu [px+32*(%2+0)+8], m2 + movu [px+32*(%2+1)+8], m3 + %endif + movu 
[px+32*(%2+0)-%1], m0 + movu [px+32*(%2+1)-%1], m1 + jmp .bottom_done +.bottom_no_right: + %if %1 == 4 + PMOVZXBW m0, [botq+strideq*0-4] + PMOVZXBW m1, [botq+strideq*1-4] + movu [px+32*(%2+0)-8], m0 + movu [px+32*(%2+1)-8], m1 + %else + movu m0, [botq+strideq*0-8] + movu m1, [botq+strideq*1-8] + punpckhbw m2, m0, m7 + punpcklbw m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m1, m7 + mova [px+32*(%2+0)-16], m0 + mova [px+32*(%2+0)+ 0], m2 + mova [px+32*(%2+1)-16], m1 + mova [px+32*(%2+1)+ 0], m3 + movd [px+32*(%2-1)+16], m6 ; overwritten by first mova + %endif + movd [px+32*(%2+0)+%1*2], m6 + movd [px+32*(%2+1)+%1*2], m6 + jmp .bottom_done +.bottom_no_left: + test edgeb, 2 ; have_right + jz .bottom_no_left_right + %if %1 == 4 + PMOVZXBW m0, [botq+strideq*0] + PMOVZXBW m1, [botq+strideq*1] + %else + movu m0, [botq+strideq*0] + movu m1, [botq+strideq*1] + punpckhbw m2, m0, m7 + punpcklbw m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m1, m7 + mova [px+32*(%2+0)+16], m2 + mova [px+32*(%2+1)+16], m3 + %endif + mova [px+32*(%2+0)+ 0], m0 + mova [px+32*(%2+1)+ 0], m1 + movd [px+32*(%2+0)- 4], m6 + movd [px+32*(%2+1)- 4], m6 + jmp .bottom_done +.bottom_no_left_right: + PMOVZXBW m0, [botq+strideq*0], %1 == 4 + PMOVZXBW m1, [botq+strideq*1], %1 == 4 + mova [px+32*(%2+0)+ 0], m0 + mova [px+32*(%2+1)+ 0], m1 + movd [px+32*(%2+0)+%1*2], m6 + movd [px+32*(%2+1)+%1*2], m6 + movd [px+32*(%2+0)- 4], m6 + movd [px+32*(%2+1)- 4], m6 + jmp .bottom_done +.no_bottom: + movu [px+32*(%2+0)- 4], m6 + movu [px+32*(%2+1)- 4], m6 + %if %1 == 8 + movq [px+32*(%2+0)+12], m6 + movq [px+32*(%2+1)+12], m6 + %endif +.bottom_done: + + ; actual filter + %if ARCH_X86_64 + DEFINE_ARGS dst, stride, _, pridmp, damping, pri, sec + mova m13, [shufb_lohi] + %if cpuflag(ssse3) + mova m15, [pw_2048] + %else + mova m15, [pw_8] + %endif + mova m14, m6 + %else + DEFINE_ARGS dst, pridmp, sec, damping, pri, tap + %xdefine m8 m1 + %xdefine m9 m2 + %xdefine m10 m0 + %xdefine m13 [base+shufb_lohi] + %xdefine m14 OUT_OF_BOUNDS_MEM + %if cpuflag(ssse3) + %xdefine m15 [base+pw_2048] + %else + %xdefine m15 [base+pw_8] + %endif + %endif + movifnidn prid, r5m + movifnidn secd, r6m + mov dampingd, r8m + movif32 [esp+0x3C], r1d + test prid, prid + jz .sec_only + movd m1, r5m + bsr pridmpd, prid + test secd, secd + jz .pri_only + movd m10, r6m + tzcnt secd, secd + and prid, 1 + sub pridmpd, dampingd + sub secd, dampingd + xor dampingd, dampingd + add prid, prid + neg pridmpd + cmovs pridmpd, dampingd + neg secd + PSHUFB_0 m1, m7 + PSHUFB_0 m10, m7 + %if ARCH_X86_64 + DEFINE_ARGS dst, stride, _, pridmp, tap, pri, sec + lea tapq, [tap_table] + MOVDDUP m11, [tapq+pridmpq*8] ; pri_shift_mask + MOVDDUP m12, [tapq+secq*8] ; sec_shift_mask + mov [rsp+0x00], pridmpq ; pri_shift + mov [rsp+0x10], secq ; sec_shift + DEFINE_ARGS dst, stride, h, dir, tap, pri, stk, k, off + %else + MOVDDUP m2, [tapq+pridmpq*8] + MOVDDUP m3, [tapq+secq*8] + mov [esp+0x04], dampingd ; zero upper 32 bits of psrlw + mov [esp+0x34], dampingd ; source operand in ACCUMULATE_TAP + mov [esp+0x00], pridmpd + mov [esp+0x30], secd + DEFINE_ARGS dst, stride, dir, stk, pri, tap, h + %define offq dstq + %define kd strided + %define kq strideq + mova [esp+0x10], m2 + mova [esp+0x40], m3 + mova [esp+0x20], m1 + mova [esp+0x50], m10 + %endif + mov dird, r7m + lea stkq, [px] + lea priq, [tapq+8*8+priq*8] ; pri_taps + mov hd, %1*%2/8 + lea dirq, [tapq+dirq*2] +.v_loop: + movif32 [esp+0x38], dstd + mov kd, 1 + %if %1 == 4 + movq m4, [stkq+32*0] + movhps m4, [stkq+32*1] + %else + mova m4, [stkq+32*0] ; px + 
%endif + pxor m0, m0 ; sum + mova m7, m4 ; max + mova m8, m4 ; min +.k_loop: + MOVDDUP m2, [priq+kq*8] + %if ARCH_X86_64 + ACCUMULATE_TAP 0*2, [rsp+0x00], m11, m1, m2, %1, 1 + MOVDDUP m2, [tapq+12*8+kq*8] + ACCUMULATE_TAP 2*2, [rsp+0x10], m12, m10, m2, %1, 1 + ACCUMULATE_TAP 6*2, [rsp+0x10], m12, m10, m2, %1, 1 + %else + ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, 1 + MOVDDUP m2, [tapq+12*8+kq*8] + ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1 + MOVDDUP m2, [tapq+12*8+kq*8] + ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1 + %endif + dec kd + jge .k_loop + movif32 dstq, [esp+0x38] + movif32 strideq, [esp+0x3C] + CDEF_FILTER_END %1, 1 + dec hd + jg .v_loop + RET + +.pri_only: +%if ARCH_X86_64 + DEFINE_ARGS dst, stride, zero, pridmp, damping, pri, tap + lea tapq, [tap_table] + %else + DEFINE_ARGS dst, pridmp, zero, damping, pri, tap + %endif + and prid, 1 + xor zerod, zerod + sub dampingd, pridmpd + cmovs dampingd, zerod + add prid, prid + PSHUFB_0 m1, m7 + MOVDDUP m7, [tapq+dampingq*8] + mov [rsp+0x00], dampingq + %if ARCH_X86_64 + DEFINE_ARGS dst, stride, h, dir, stk, pri, tap, k, off + %else + mov [rsp+0x04], zerod + DEFINE_ARGS dst, stride, dir, stk, pri, tap, h + %endif + mov dird, r7m + lea stkq, [px] + lea priq, [tapq+8*8+priq*8] + mov hd, %1*%2/8 + lea dirq, [tapq+dirq*2] +.pri_v_loop: + movif32 [esp+0x38], dstd + mov kd, 1 + %if %1 == 4 + movq m4, [stkq+32*0] + movhps m4, [stkq+32*1] + %else + mova m4, [stkq+32*0] + %endif + pxor m0, m0 +.pri_k_loop: + MOVDDUP m2, [priq+kq*8] + ACCUMULATE_TAP 0*2, [rsp], m7, m1, m2, %1, 0 + dec kd + jge .pri_k_loop + movif32 dstq, [esp+0x38] + movif32 strideq, [esp+0x3C] + CDEF_FILTER_END %1, 0 + dec hd + jg .pri_v_loop + RET + +.sec_only: +%if ARCH_X86_64 + DEFINE_ARGS dst, stride, zero, dir, damping, tap, sec +%else + DEFINE_ARGS dst, stride, sec, damping, dir, tap, zero +%endif + movd m1, r6m + tzcnt secd, secd + mov dird, r7m + xor zerod, zerod + sub dampingd, secd + cmovs dampingd, zerod + PSHUFB_0 m1, m7 + %if ARCH_X86_64 + lea tapq, [tap_table] + %else + mov [rsp+0x04], zerod + %endif + mov [rsp+0x00], dampingq + MOVDDUP m7, [tapq+dampingq*8] + lea dirq, [tapq+dirq*2] + %if ARCH_X86_64 + DEFINE_ARGS dst, stride, h, dir, stk, tap, off, k + %else + DEFINE_ARGS dst, stride, off, stk, dir, tap, h + %endif + lea stkq, [px] + mov hd, %1*%2/8 +.sec_v_loop: + mov kd, 1 + %if %1 == 4 + movq m4, [stkq+32*0] + movhps m4, [stkq+32*1] + %else + mova m4, [stkq+32*0] + %endif + pxor m0, m0 +.sec_k_loop: + MOVDDUP m2, [tapq+12*8+kq*8] + ACCUMULATE_TAP 2*2, [rsp], m7, m1, m2, %1, 0 + %if ARCH_X86_32 + MOVDDUP m2, [tapq+12*8+kq*8] + %endif + ACCUMULATE_TAP 6*2, [rsp], m7, m1, m2, %1, 0 + dec kd + jge .sec_k_loop + movif32 strideq, [esp+0x3C] + CDEF_FILTER_END %1, 0 + dec hd + jg .sec_v_loop + RET +%endmacro + +%macro MULLD 2 + %if cpuflag(sse4) + pmulld %1, %2 + %else + %if ARCH_X86_32 + %define m15 m1 + %endif + pmulhuw m15, %1, %2 + pmullw %1, %2 + pslld m15, 16 + paddd %1, m15 + %endif +%endmacro + +%macro CDEF_DIR 0 + %if ARCH_X86_64 +cglobal cdef_dir_8bpc, 3, 7, 16, src, stride, var + lea r6, [strideq*3] + movq m1, [srcq+strideq*0] + movhps m1, [srcq+strideq*1] + movq m3, [srcq+strideq*2] + movhps m3, [srcq+r6 ] + lea srcq, [srcq+strideq*4] + movq m5, [srcq+strideq*0] + movhps m5, [srcq+strideq*1] + movq m7, [srcq+strideq*2] + movhps m7, [srcq+r6 ] + + pxor m8, m8 + psadbw m9, m1, m8 + psadbw m2, m3, m8 + psadbw m4, m5, m8 + psadbw m6, m7, m8 + packssdw m9, m2 + packssdw m4, m6 + packssdw m9, m4 + 
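; m9 now holds the eight per-row byte sums from psadbw; removing the bias + ; below (psllw m8, 3 = 8*128) turns it into partial_sum_hv[0]. In scalar + ; terms (a sketch): each cost[d] accumulates partial_sum^2 * (840/len) over + ; that direction's lines, the best direction is the argmax of the 8 costs, + ; and the reported variance is (best_cost - cost[best_dir ^ 4]) >> 10 + 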
+ punpcklbw m0, m1, m8 + punpckhbw m1, m8 + punpcklbw m2, m3, m8 + punpckhbw m3, m8 + punpcklbw m4, m5, m8 + punpckhbw m5, m8 + punpcklbw m6, m7, m8 + punpckhbw m7, m8 +cglobal_label .main + mova m8, [pw_128] + psubw m0, m8 + psubw m1, m8 + psubw m2, m8 + psubw m3, m8 + psubw m4, m8 + psubw m5, m8 + psubw m6, m8 + psubw m7, m8 + psllw m8, 3 + psubw m9, m8 ; partial_sum_hv[0] + + paddw m8, m0, m1 + paddw m10, m2, m3 + paddw m8, m4 + paddw m10, m5 + paddw m8, m6 + paddw m10, m7 + paddw m8, m10 ; partial_sum_hv[1] + + pmaddwd m8, m8 + pmaddwd m9, m9 + phaddd m9, m8 + SWAP m8, m9 + MULLD m8, [div_table%+SUFFIX+48] + + pslldq m9, m1, 2 + psrldq m10, m1, 14 + pslldq m11, m2, 4 + psrldq m12, m2, 12 + pslldq m13, m3, 6 + psrldq m14, m3, 10 + paddw m9, m0 + paddw m10, m12 + paddw m11, m13 + paddw m10, m14 ; partial_sum_diag[0] top/right half + paddw m9, m11 ; partial_sum_diag[0] top/left half + pslldq m11, m4, 8 + psrldq m12, m4, 8 + pslldq m13, m5, 10 + psrldq m14, m5, 6 + paddw m9, m11 + paddw m10, m12 + paddw m9, m13 + paddw m10, m14 + pslldq m11, m6, 12 + psrldq m12, m6, 4 + pslldq m13, m7, 14 + psrldq m14, m7, 2 + paddw m9, m11 + paddw m10, m12 + paddw m9, m13 ; partial_sum_diag[0][0-7] + paddw m10, m14 ; partial_sum_diag[0][8-14,zero] + pshufb m10, [shufw_6543210x] + punpckhwd m11, m9, m10 + punpcklwd m9, m10 + pmaddwd m11, m11 + pmaddwd m9, m9 + MULLD m11, [div_table%+SUFFIX+16] + MULLD m9, [div_table%+SUFFIX+0] + paddd m9, m11 ; cost[0a-d] + + pslldq m10, m0, 14 + psrldq m11, m0, 2 + pslldq m12, m1, 12 + psrldq m13, m1, 4 + pslldq m14, m2, 10 + psrldq m15, m2, 6 + paddw m10, m12 + paddw m11, m13 + paddw m10, m14 + paddw m11, m15 + pslldq m12, m3, 8 + psrldq m13, m3, 8 + pslldq m14, m4, 6 + psrldq m15, m4, 10 + paddw m10, m12 + paddw m11, m13 + paddw m10, m14 + paddw m11, m15 + pslldq m12, m5, 4 + psrldq m13, m5, 12 + pslldq m14, m6, 2 + psrldq m15, m6, 14 + paddw m10, m12 + paddw m11, m13 + paddw m10, m14 + paddw m11, m15 ; partial_sum_diag[1][8-14,zero] + paddw m10, m7 ; partial_sum_diag[1][0-7] + pshufb m11, [shufw_6543210x] + punpckhwd m12, m10, m11 + punpcklwd m10, m11 + pmaddwd m12, m12 + pmaddwd m10, m10 + MULLD m12, [div_table%+SUFFIX+16] + MULLD m10, [div_table%+SUFFIX+0] + paddd m10, m12 ; cost[4a-d] + phaddd m9, m10 ; cost[0a/b,4a/b] + + paddw m10, m0, m1 + paddw m11, m2, m3 + paddw m12, m4, m5 + paddw m13, m6, m7 + phaddw m0, m4 + phaddw m1, m5 + phaddw m2, m6 + phaddw m3, m7 + + ; m0-3 are horizontal sums (x >> 1), m10-13 are vertical sums (y >> 1) + pslldq m4, m11, 2 + psrldq m5, m11, 14 + pslldq m6, m12, 4 + psrldq m7, m12, 12 + pslldq m14, m13, 6 + psrldq m15, m13, 10 + paddw m4, m10 + paddw m5, m7 + paddw m4, m6 + paddw m5, m15 ; partial_sum_alt[3] right + paddw m4, m14 ; partial_sum_alt[3] left + pshuflw m6, m5, q3012 + punpckhwd m5, m4 + punpcklwd m4, m6 + pmaddwd m5, m5 + pmaddwd m4, m4 + MULLD m5, [div_table%+SUFFIX+48] + MULLD m4, [div_table%+SUFFIX+32] + paddd m4, m5 ; cost[7a-d] + + pslldq m5, m10, 6 + psrldq m6, m10, 10 + pslldq m7, m11, 4 + psrldq m10, m11, 12 + pslldq m11, m12, 2 + psrldq m12, 14 + paddw m5, m7 + paddw m6, m10 + paddw m5, m11 + paddw m6, m12 + paddw m5, m13 + pshuflw m7, m6, q3012 + punpckhwd m6, m5 + punpcklwd m5, m7 + pmaddwd m6, m6 + pmaddwd m5, m5 + MULLD m6, [div_table%+SUFFIX+48] + MULLD m5, [div_table%+SUFFIX+32] + paddd m5, m6 ; cost[5a-d] + + pslldq m6, m1, 2 + psrldq m7, m1, 14 + pslldq m10, m2, 4 + psrldq m11, m2, 12 + pslldq m12, m3, 6 + psrldq m13, m3, 10 + paddw m6, m0 + paddw m7, m11 + paddw m6, m10 + paddw m7, m13 ; 
partial_sum_alt[3] right + paddw m6, m12 ; partial_sum_alt[3] left + pshuflw m10, m7, q3012 + punpckhwd m7, m6 + punpcklwd m6, m10 + pmaddwd m7, m7 + pmaddwd m6, m6 + MULLD m7, [div_table%+SUFFIX+48] + MULLD m6, [div_table%+SUFFIX+32] + paddd m6, m7 ; cost[1a-d] + + pshufd m0, m0, q1032 + pshufd m1, m1, q1032 + pshufd m2, m2, q1032 + pshufd m3, m3, q1032 + + pslldq m10, m0, 6 + psrldq m11, m0, 10 + pslldq m12, m1, 4 + psrldq m13, m1, 12 + pslldq m14, m2, 2 + psrldq m2, 14 + paddw m10, m12 + paddw m11, m13 + paddw m10, m14 + paddw m11, m2 + paddw m10, m3 + pshuflw m12, m11, q3012 + punpckhwd m11, m10 + punpcklwd m10, m12 + pmaddwd m11, m11 + pmaddwd m10, m10 + MULLD m11, [div_table%+SUFFIX+48] + MULLD m10, [div_table%+SUFFIX+32] + paddd m10, m11 ; cost[3a-d] + + phaddd m9, m8 ; cost[0,4,2,6] + phaddd m6, m10 + phaddd m5, m4 + phaddd m6, m5 ; cost[1,3,5,7] + pshufd m4, m9, q3120 + + ; now find the best cost + %if cpuflag(sse4) + pmaxsd m9, m6 + pshufd m0, m9, q1032 + pmaxsd m0, m9 + pshufd m1, m0, q2301 + pmaxsd m0, m1 ; best cost + %else + pcmpgtd m0, m9, m6 + pand m9, m0 + pandn m0, m6 + por m9, m0 + pshufd m1, m9, q1032 + pcmpgtd m0, m9, m1 + pand m9, m0 + pandn m0, m1 + por m9, m0 + pshufd m1, m9, q2301 + pcmpgtd m0, m9, m1 + pand m9, m0 + pandn m0, m1 + por m0, m9 + %endif + + ; get direction and variance + punpckhdq m1, m4, m6 + punpckldq m4, m6 + psubd m2, m0, m1 + psubd m3, m0, m4 +%if WIN64 + WIN64_RESTORE_XMM + %define tmp rsp+stack_offset+8 +%else + %define tmp rsp-40 +%endif + mova [tmp+0x00], m2 ; emulate ymm in stack + mova [tmp+0x10], m3 + pcmpeqd m1, m0 ; compute best cost mask + pcmpeqd m4, m0 + packssdw m4, m1 + pmovmskb eax, m4 ; get byte-idx from mask + tzcnt eax, eax + mov r1d, [tmp+rax*2] ; get idx^4 complement from emulated ymm + shr eax, 1 ; get direction by converting byte-idx to word-idx + shr r1d, 10 + mov [varq], r1d + %else +cglobal cdef_dir_8bpc, 2, 4, 8, 96, src, stride, var, stride3 +%define base r2-shufw_6543210x + LEA r2, shufw_6543210x + pxor m0, m0 + lea stride3q, [strideq*3] + movq m5, [srcq+strideq*0] + movhps m5, [srcq+strideq*1] + movq m7, [srcq+strideq*2] + movhps m7, [srcq+stride3q] + mova m1, [base+pw_128] + psadbw m2, m5, m0 + psadbw m3, m7, m0 + packssdw m2, m3 + punpcklbw m4, m5, m0 + punpckhbw m5, m0 + punpcklbw m6, m7, m0 + punpckhbw m7, m0 + psubw m4, m1 + psubw m5, m1 + psubw m6, m1 + psubw m7, m1 + + mova [esp+0x00], m4 + mova [esp+0x10], m5 + mova [esp+0x20], m6 + mova [esp+0x50], m7 + + lea srcq, [srcq+strideq*4] + movq m5, [srcq+strideq*0] + movhps m5, [srcq+strideq*1] + movq m7, [srcq+strideq*2] + movhps m7, [srcq+stride3q] + psadbw m3, m5, m0 + psadbw m0, m7 + packssdw m3, m0 + pxor m0, m0 + punpcklbw m4, m5, m0 + punpckhbw m5, m0 + punpcklbw m6, m7, m0 + punpckhbw m7, m0 +cglobal_label .main + psubw m4, m1 + psubw m5, m1 + psubw m6, m1 + psubw m7, m1 + packssdw m2, m3 + psllw m1, 3 + psubw m2, m1 ; partial_sum_hv[0] + pmaddwd m2, m2 + + mova m3, [esp+0x50] + mova m0, [esp+0x00] + paddw m0, [esp+0x10] + paddw m1, m3, [esp+0x20] + paddw m0, m4 + paddw m1, m5 + paddw m0, m6 + paddw m1, m7 + paddw m0, m1 ; partial_sum_hv[1] + pmaddwd m0, m0 + + phaddd m2, m0 + MULLD m2, [base+div_table%+SUFFIX+48] + mova [esp+0x30], m2 + + mova m1, [esp+0x10] + pslldq m0, m1, 2 + psrldq m1, 14 + paddw m0, [esp+0x00] + pslldq m2, m3, 6 + psrldq m3, 10 + paddw m0, m2 + paddw m1, m3 + mova m3, [esp+0x20] + pslldq m2, m3, 4 + psrldq m3, 12 + paddw m0, m2 ; partial_sum_diag[0] top/left half + paddw m1, m3 ; partial_sum_diag[0] top/right half + pslldq m2, m4, 8 
+ psrldq m3, m4, 8 + paddw m0, m2 + paddw m1, m3 + pslldq m2, m5, 10 + psrldq m3, m5, 6 + paddw m0, m2 + paddw m1, m3 + pslldq m2, m6, 12 + psrldq m3, m6, 4 + paddw m0, m2 + paddw m1, m3 + pslldq m2, m7, 14 + psrldq m3, m7, 2 + paddw m0, m2 ; partial_sum_diag[0][0-7] + paddw m1, m3 ; partial_sum_diag[0][8-14,zero] + mova m3, [esp+0x50] + pshufb m1, [base+shufw_6543210x] + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pmaddwd m2, m2 + pmaddwd m0, m0 + MULLD m2, [base+div_table%+SUFFIX+16] + MULLD m0, [base+div_table%+SUFFIX+ 0] + paddd m0, m2 ; cost[0a-d] + mova [esp+0x40], m0 + + mova m1, [esp+0x00] + pslldq m0, m1, 14 + psrldq m1, 2 + paddw m0, m7 + pslldq m2, m3, 8 + psrldq m3, 8 + paddw m0, m2 + paddw m1, m3 + mova m3, [esp+0x20] + pslldq m2, m3, 10 + psrldq m3, 6 + paddw m0, m2 + paddw m1, m3 + mova m3, [esp+0x10] + pslldq m2, m3, 12 + psrldq m3, 4 + paddw m0, m2 + paddw m1, m3 + pslldq m2, m4, 6 + psrldq m3, m4, 10 + paddw m0, m2 + paddw m1, m3 + pslldq m2, m5, 4 + psrldq m3, m5, 12 + paddw m0, m2 + paddw m1, m3 + pslldq m2, m6, 2 + psrldq m3, m6, 14 + paddw m0, m2 ; partial_sum_diag[1][0-7] + paddw m1, m3 ; partial_sum_diag[1][8-14,zero] + mova m3, [esp+0x50] + pshufb m1, [base+shufw_6543210x] + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pmaddwd m2, m2 + pmaddwd m0, m0 + MULLD m2, [base+div_table%+SUFFIX+16] + MULLD m0, [base+div_table%+SUFFIX+ 0] + paddd m0, m2 ; cost[4a-d] + phaddd m1, [esp+0x40], m0 ; cost[0a/b,4a/b] + phaddd m1, [esp+0x30] ; cost[0,4,2,6] + mova [esp+0x30], m1 + + phaddw m0, [esp+0x00], m4 + phaddw m1, [esp+0x10], m5 + paddw m4, m5 + mova m2, [esp+0x20] + paddw m5, m2, m3 + phaddw m2, m6 + paddw m6, m7 + phaddw m3, m7 + mova m7, [esp+0x00] + paddw m7, [esp+0x10] + mova [esp+0x00], m0 + mova [esp+0x10], m1 + mova [esp+0x20], m2 + + pslldq m1, m4, 4 + pslldq m2, m6, 6 + pslldq m0, m5, 2 + paddw m1, m2 + paddw m0, m7 + psrldq m2, m5, 14 + paddw m0, m1 ; partial_sum_alt[3] left + psrldq m1, m4, 12 + paddw m1, m2 + psrldq m2, m6, 10 + paddw m1, m2 ; partial_sum_alt[3] right + pshuflw m1, m1, q3012 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pmaddwd m2, m2 + pmaddwd m0, m0 + MULLD m2, [base+div_table%+SUFFIX+48] + MULLD m0, [base+div_table%+SUFFIX+32] + paddd m0, m2 ; cost[7a-d] + mova [esp+0x40], m0 + + pslldq m0, m7, 6 + psrldq m7, 10 + pslldq m1, m5, 4 + psrldq m5, 12 + pslldq m2, m4, 2 + psrldq m4, 14 + paddw m0, m6 + paddw m7, m5 + paddw m0, m1 + paddw m7, m4 + paddw m0, m2 + pshuflw m2, m7, q3012 + punpckhwd m7, m0 + punpcklwd m0, m2 + pmaddwd m7, m7 + pmaddwd m0, m0 + MULLD m7, [base+div_table%+SUFFIX+48] + MULLD m0, [base+div_table%+SUFFIX+32] + paddd m0, m7 ; cost[5a-d] + mova [esp+0x50], m0 + + mova m7, [esp+0x10] + mova m2, [esp+0x20] + pslldq m0, m7, 2 + psrldq m7, 14 + pslldq m4, m2, 4 + psrldq m2, 12 + pslldq m5, m3, 6 + psrldq m6, m3, 10 + paddw m0, [esp+0x00] + paddw m7, m2 + paddw m4, m5 + paddw m7, m6 ; partial_sum_alt[3] right + paddw m0, m4 ; partial_sum_alt[3] left + pshuflw m2, m7, q3012 + punpckhwd m7, m0 + punpcklwd m0, m2 + pmaddwd m7, m7 + pmaddwd m0, m0 + MULLD m7, [base+div_table%+SUFFIX+48] + MULLD m0, [base+div_table%+SUFFIX+32] + paddd m0, m7 ; cost[1a-d] + SWAP m0, m4 + + pshufd m0, [esp+0x00], q1032 + pshufd m1, [esp+0x10], q1032 + pshufd m2, [esp+0x20], q1032 + pshufd m3, m3, q1032 + mova [esp+0x00], m4 + + pslldq m4, m0, 6 + psrldq m0, 10 + pslldq m5, m1, 4 + psrldq m1, 12 + pslldq m6, m2, 2 + psrldq m2, 14 + paddw m4, m3 + paddw m0, m1 + paddw m5, m6 + paddw m0, m2 + paddw m4, m5 + pshuflw m2, m0, q3012 + punpckhwd m0, m4 + punpcklwd m4, 
m2 + pmaddwd m0, m0 + pmaddwd m4, m4 + MULLD m0, [base+div_table%+SUFFIX+48] + MULLD m4, [base+div_table%+SUFFIX+32] + paddd m4, m0 ; cost[3a-d] + + mova m1, [esp+0x00] + mova m2, [esp+0x50] + mova m0, [esp+0x30] ; cost[0,4,2,6] + phaddd m1, m4 + phaddd m2, [esp+0x40] ; cost[1,3,5,7] + phaddd m1, m2 + pshufd m2, m0, q3120 + + ; now find the best cost + %if cpuflag(sse4) + pmaxsd m0, m1 + pshufd m3, m0, q1032 + pmaxsd m3, m0 + pshufd m0, m3, q2301 + pmaxsd m0, m3 + %else + pcmpgtd m3, m0, m1 + pand m0, m3 + pandn m3, m1 + por m0, m3 + pshufd m4, m0, q1032 + pcmpgtd m3, m0, m4 + pand m0, m3 + pandn m3, m4 + por m0, m3 + pshufd m4, m0, q2301 + pcmpgtd m3, m0, m4 + pand m0, m3 + pandn m3, m4 + por m0, m3 + %endif + + ; get direction and variance + mov vard, varm + punpckhdq m3, m2, m1 + punpckldq m2, m1 + psubd m1, m0, m3 + psubd m4, m0, m2 + mova [esp+0x00], m1 ; emulate ymm in stack + mova [esp+0x10], m4 + pcmpeqd m3, m0 ; compute best cost mask + pcmpeqd m2, m0 + packssdw m2, m3 + pmovmskb eax, m2 ; get byte-idx from mask + tzcnt eax, eax + mov r1d, [esp+eax*2] ; get idx^4 complement from emulated ymm + shr eax, 1 ; get direction by converting byte-idx to word-idx + shr r1d, 10 + mov [vard], r1d + %endif + + RET +%endmacro + +INIT_XMM sse4 +CDEF_FILTER 8, 8 +CDEF_FILTER 4, 8 +CDEF_FILTER 4, 4 +CDEF_DIR + +INIT_XMM ssse3 +CDEF_FILTER 8, 8 +CDEF_FILTER 4, 8 +CDEF_FILTER 4, 4 +CDEF_DIR + +INIT_XMM sse2 +CDEF_FILTER 8, 8 +CDEF_FILTER 4, 8 +CDEF_FILTER 4, 4 diff --git a/third_party/dav1d/src/x86/cpu.c b/third_party/dav1d/src/x86/cpu.c new file mode 100644 index 0000000000..764d8be8ef --- /dev/null +++ b/third_party/dav1d/src/x86/cpu.c @@ -0,0 +1,100 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include <stdint.h> +#include <string.h> + +#include "common/attributes.h" + +#include "src/x86/cpu.h" + +typedef struct { + uint32_t eax, ebx, edx, ecx; +} CpuidRegisters; + +void dav1d_cpu_cpuid(CpuidRegisters *regs, unsigned leaf, unsigned subleaf); +uint64_t dav1d_cpu_xgetbv(unsigned xcr); + +#define X(reg, mask) (((reg) & (mask)) == (mask)) + +COLD unsigned dav1d_get_cpu_flags_x86(void) { + union { + CpuidRegisters r; + struct { + uint32_t max_leaf; + char vendor[12]; + }; + } cpu; + dav1d_cpu_cpuid(&cpu.r, 0, 0); + unsigned flags = 0; + + if (cpu.max_leaf >= 1) { + CpuidRegisters r; + dav1d_cpu_cpuid(&r, 1, 0); + const unsigned model = ((r.eax >> 4) & 0x0f) + ((r.eax >> 12) & 0xf0); + const unsigned family = ((r.eax >> 8) & 0x0f) + ((r.eax >> 20) & 0xff); + + if (X(r.edx, 0x06008000)) /* CMOV/SSE/SSE2 */ { + flags |= DAV1D_X86_CPU_FLAG_SSE2; + if (X(r.ecx, 0x00000201)) /* SSE3/SSSE3 */ { + flags |= DAV1D_X86_CPU_FLAG_SSSE3; + if (X(r.ecx, 0x00080000)) /* SSE4.1 */ + flags |= DAV1D_X86_CPU_FLAG_SSE41; + } + } +#if ARCH_X86_64 + /* We only support >128-bit SIMD on x86-64. */ + if (X(r.ecx, 0x18000000)) /* OSXSAVE/AVX */ { + const uint64_t xcr0 = dav1d_cpu_xgetbv(0); + if (X(xcr0, 0x00000006)) /* XMM/YMM */ { + if (cpu.max_leaf >= 7) { + dav1d_cpu_cpuid(&r, 7, 0); + if (X(r.ebx, 0x00000128)) /* BMI1/BMI2/AVX2 */ { + flags |= DAV1D_X86_CPU_FLAG_AVX2; + if (X(xcr0, 0x000000e0)) /* ZMM/OPMASK */ { + if (X(r.ebx, 0xd0230000) && X(r.ecx, 0x00005f42)) + flags |= DAV1D_X86_CPU_FLAG_AVX512ICL; + } + } + } + } + } +#endif + if (!memcmp(cpu.vendor, "AuthenticAMD", sizeof(cpu.vendor))) { + if ((flags & DAV1D_X86_CPU_FLAG_AVX2) && (family < 0x19 || + (family == 0x19 && (model < 0x10 || (model >= 0x20 && model < 0x60))))) + { + /* Excavator, Zen, Zen+, Zen 2, Zen 3, Zen 3+ */ + flags |= DAV1D_X86_CPU_FLAG_SLOW_GATHER; + } + } + } + + return flags; +} diff --git a/third_party/dav1d/src/x86/cpu.h b/third_party/dav1d/src/x86/cpu.h new file mode 100644 index 0000000000..8529c77c9b --- /dev/null +++ b/third_party/dav1d/src/x86/cpu.h @@ -0,0 +1,44 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_SRC_X86_CPU_H +#define DAV1D_SRC_X86_CPU_H + +enum CpuFlags { + DAV1D_X86_CPU_FLAG_SSE2 = 1 << 0, + DAV1D_X86_CPU_FLAG_SSSE3 = 1 << 1, + DAV1D_X86_CPU_FLAG_SSE41 = 1 << 2, + DAV1D_X86_CPU_FLAG_AVX2 = 1 << 3, + DAV1D_X86_CPU_FLAG_AVX512ICL = 1 << 4, /* F/CD/BW/DQ/VL/VNNI/IFMA/VBMI/VBMI2/ + * VPOPCNTDQ/BITALG/GFNI/VAES/VPCLMULQDQ */ + DAV1D_X86_CPU_FLAG_SLOW_GATHER = 1 << 5, /* Flag CPUs where gather instructions are slow enough + * to cause performance regressions. */ +}; + +unsigned dav1d_get_cpu_flags_x86(void); + +#endif /* DAV1D_SRC_X86_CPU_H */ diff --git a/third_party/dav1d/src/x86/cpuid.asm b/third_party/dav1d/src/x86/cpuid.asm new file mode 100644 index 0000000000..e1d9228660 --- /dev/null +++ b/third_party/dav1d/src/x86/cpuid.asm @@ -0,0 +1,55 @@ +; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION .text + +cglobal cpu_cpuid, 0, 5, 0, regs, leaf, subleaf + mov r4, regsmp + mov eax, leafm + mov ecx, subleafm +%if ARCH_X86_64 + mov r5, rbx +%endif + cpuid + mov [r4+4*0], eax + mov [r4+4*1], ebx + mov [r4+4*2], edx + mov [r4+4*3], ecx +%if ARCH_X86_64 + mov rbx, r5 +%endif + RET + +cglobal cpu_xgetbv, 0, 0, 0, xcr + movifnidn ecx, xcrm + xgetbv +%if ARCH_X86_64 + shl rdx, 32 + or rax, rdx +%endif + RET diff --git a/third_party/dav1d/src/x86/filmgrain.h b/third_party/dav1d/src/x86/filmgrain.h new file mode 100644 index 0000000000..eeaa328d1e --- /dev/null +++ b/third_party/dav1d/src/x86/filmgrain.h @@ -0,0 +1,81 @@ +/* + * Copyright © 2018-2022, VideoLAN and dav1d authors + * Copyright © 2018-2022, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/filmgrain.h" + +#define decl_fg_fns(ext) \ +decl_generate_grain_y_fn(BF(dav1d_generate_grain_y, ext)); \ +decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_420, ext)); \ +decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_422, ext)); \ +decl_generate_grain_uv_fn(BF(dav1d_generate_grain_uv_444, ext)); \ +decl_fgy_32x32xn_fn(BF(dav1d_fgy_32x32xn, ext)); \ +decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i420, ext)); \ +decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i422, ext)); \ +decl_fguv_32x32xn_fn(BF(dav1d_fguv_32x32xn_i444, ext)) + +decl_fg_fns(ssse3); +decl_fg_fns(avx2); +decl_fg_fns(avx512icl); + +static ALWAYS_INLINE void film_grain_dsp_init_x86(Dav1dFilmGrainDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + + c->generate_grain_y = BF(dav1d_generate_grain_y, ssse3); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, ssse3); + c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, ssse3); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, ssse3); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, ssse3); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, ssse3); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, ssse3); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, ssse3); + +#if ARCH_X86_64 + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; + + c->generate_grain_y = BF(dav1d_generate_grain_y, avx2); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, avx2); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, avx2); + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, avx2); + + if (!(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) { + c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx2); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx2); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx2); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx2); + } + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + + c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx512icl); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_fguv_32x32xn_i420, avx512icl); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx512icl); + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx512icl); +#endif +} diff --git a/third_party/dav1d/src/x86/filmgrain16_avx2.asm b/third_party/dav1d/src/x86/filmgrain16_avx2.asm new 
file mode 100644 index 0000000000..a1d4c41f27 --- /dev/null +++ b/third_party/dav1d/src/x86/filmgrain16_avx2.asm @@ -0,0 +1,2248 @@ +; Copyright © 2021-2022, VideoLAN and dav1d authors +; Copyright © 2021-2022, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" +%include "x86/filmgrain_common.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 16 +pb_mask: db 0,128,128, 0,128, 0, 0,128,128, 0, 0,128, 0,128,128, 0 +gen_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 +gen_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 +next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 +pw_27_17_17_27: dw 27, 17, 17, 27 +pw_23_22: dw 23, 22, 0, 32 +pw_seed_xor: times 2 dw 0xb524 + times 2 dw 0x49d8 +gen_ar0_shift: times 4 db 128 + times 4 db 64 + times 4 db 32 + times 4 db 16 +pd_16: dd 16 +pd_m65536: dd -65536 +pb_1: times 4 db 1 +grain_max: times 2 dw 511 + times 2 dw 2047 +grain_min: times 2 dw -512 + times 2 dw -2048 +fg_max: times 2 dw 1023 + times 2 dw 4095 + times 2 dw 960 + times 2 dw 3840 + times 2 dw 940 + times 2 dw 3760 +fg_min: times 2 dw 0 + times 2 dw 64 + times 2 dw 256 +uv_offset_mul: dd 256 + dd 1024 +hmul_bits: dw 32768, 16384, 8192, 4096 +round: dw 2048, 1024, 512 +mul_bits: dw 256, 128, 64, 32, 16, 8 +round_vals: dw 32, 64, 128, 256, 512, 1024 +pb_8_9_0_1: db 8, 9, 0, 1 + +%macro JMP_TABLE 1-* + %xdefine %1_table %%table + %xdefine %%base %1_table + %xdefine %%prefix mangle(private_prefix %+ _%1) + %%table: + %rep %0 - 1 + dd %%prefix %+ .ar%2 - %%base + %rotate 1 + %endrep +%endmacro + +JMP_TABLE generate_grain_y_16bpc_avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_420_16bpc_avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_422_16bpc_avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_444_16bpc_avx2, 0, 1, 2, 3 + +SECTION .text + +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +INIT_YMM avx2 +cglobal generate_grain_y_16bpc, 3, 9, 14, buf, fg_data, bdmax +%define base r4-generate_grain_y_16bpc_avx2_table + lea r4, [generate_grain_y_16bpc_avx2_table] + vpbroadcastw xm0, [fg_dataq+FGData.seed] + mov r6d, [fg_dataq+FGData.grain_scale_shift] + movq xm1, [base+next_upperbit_mask] + mov r3, -73*82*2 + movsxd r5, [fg_dataq+FGData.ar_coeff_lag] + lea r7d, 
[bdmaxq+1] + movq xm4, [base+mul_bits] + shr r7d, 11 ; 0 for 10bpc, 2 for 12bpc + movq xm5, [base+hmul_bits] + sub r6, r7 + mova xm6, [base+pb_mask] + sub bufq, r3 + vpbroadcastw xm7, [base+round+r6*2-2] + lea r6, [gaussian_sequence] + movsxd r5, [r4+r5*4] +.loop: + pand xm2, xm0, xm1 + psrlw xm3, xm2, 10 + por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw xm2, xm4 ; bits 0x0f00 are set + pmulhuw xm0, xm5 + pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds + psllq xm2, xm3, 30 + por xm2, xm3 + psllq xm3, xm2, 15 + por xm2, xm0 ; aggregate each bit into next seed's high bit + por xm3, xm2 ; 4 next output seeds + pshuflw xm0, xm3, q3333 + psrlw xm3, 5 + pand xm2, xm0, xm1 + movq r7, xm3 + psrlw xm3, xm2, 10 + por xm2, xm3 + pmullw xm2, xm4 + pmulhuw xm0, xm5 + movzx r8d, r7w + pshufb xm3, xm6, xm2 + psllq xm2, xm3, 30 + por xm2, xm3 + psllq xm3, xm2, 15 + por xm0, xm2 + movd xm2, [r6+r8*2] + rorx r8, r7, 32 + por xm3, xm0 + shr r7d, 16 + pinsrw xm2, [r6+r7*2], 1 + pshuflw xm0, xm3, q3333 + movzx r7d, r8w + psrlw xm3, 5 + pinsrw xm2, [r6+r7*2], 2 + shr r8d, 16 + movq r7, xm3 + pinsrw xm2, [r6+r8*2], 3 + movzx r8d, r7w + pinsrw xm2, [r6+r8*2], 4 + rorx r8, r7, 32 + shr r7d, 16 + pinsrw xm2, [r6+r7*2], 5 + movzx r7d, r8w + pinsrw xm2, [r6+r7*2], 6 + shr r8d, 16 + pinsrw xm2, [r6+r8*2], 7 + paddw xm2, xm2 ; otherwise bpc=12 w/ grain_scale_shift=0 + pmulhrsw xm2, xm7 ; shifts by 0, which pmulhrsw does not support + mova [bufq+r3], xm2 + add r3, 8*2 + jl .loop + + ; auto-regression code + add r5, r4 + jmp r5 + +.ar1: + DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] + movd xm4, [fg_dataq+FGData.ar_coeffs_y] + DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0 + pinsrb xm4, [base+pb_1], 3 + pmovsxbw xm4, xm4 + pshufd xm5, xm4, q1111 + pshufd xm4, xm4, q0000 + vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd + sub bufq, 2*(82*73-(82*3+79)) + mov hd, 70 + sar maxd, 1 + mov mind, maxd + xor mind, -1 +.y_loop_ar1: + mov xq, -76 + movsx val3d, word [bufq+xq*2-2] +.x_loop_ar1: + movu xm0, [bufq+xq*2-82*2-2] ; top/left + psrldq xm2, xm0, 2 ; top + psrldq xm1, xm0, 4 ; top/right + punpcklwd xm0, xm2 + punpcklwd xm1, xm3 + pmaddwd xm0, xm4 + pmaddwd xm1, xm5 + paddd xm0, xm1 +.x_loop_ar1_inner: + movd val0d, xm0 + psrldq xm0, 4 + imul val3d, cf3d + add val3d, val0d + sarx val3d, val3d, shiftd + movsx val0d, word [bufq+xq*2] + add val3d, val0d + cmp val3d, maxd + cmovg val3d, maxd + cmp val3d, mind + cmovl val3d, mind + mov word [bufq+xq*2], val3w + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xb, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 +.x_loop_ar1_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar1 +.ar0: + RET + +.ar2: + DEFINE_ARGS buf, fg_data, bdmax, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movq xm0, [fg_dataq+FGData.ar_coeffs_y+5] ; cf5-11 + vinserti128 m0, [fg_dataq+FGData.ar_coeffs_y+0], 1 ; cf0-4 + vpbroadcastw xm10, [base+round_vals-12+shiftq*2] + pxor m1, m1 + punpcklwd xm10, xm1 + pcmpgtb m1, m0 + punpcklbw m0, m1 ; cf5-11,0-4 + vpermq m1, m0, q3333 ; cf4 + vbroadcasti128 m11, [base+gen_shufA] + pshufd m6, m0, q0000 ; cf[5,6], cf[0-1] + vbroadcasti128 m12, [base+gen_shufB] + pshufd m7, m0, q1111 ; cf[7,8], cf[2-3] + punpckhwd xm1, xm0 + pshufhw xm9, xm0, q2121 + pshufd xm8, xm1, q0000 ; cf[4,9] + sar bdmaxd, 1 + punpckhqdq xm9, xm9 ; cf[10,11] + movd xm4, bdmaxd ; max_grain + pcmpeqd xm5, xm5 + sub bufq, 
2*(82*73-(82*3+79)) + pxor xm5, xm4 ; min_grain + DEFINE_ARGS buf, fg_data, h, x + mov hd, 70 +.y_loop_ar2: + mov xq, -76 +.x_loop_ar2: + vbroadcasti128 m2, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5] + vinserti128 m1, m2, [bufq+xq*2-82*2-4], 0 ; y=-1,x=[-2,+5] + pshufb m0, m1, m11 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] + pmaddwd m0, m6 + punpckhwd xm2, xm1 ; y=-2/-1 interleaved, x=[+2,+5] + pshufb m1, m12 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] + pmaddwd m1, m7 + pmaddwd xm2, xm8 + paddd m0, m1 + vextracti128 xm1, m0, 1 + paddd xm0, xm10 + paddd xm2, xm0 + movu xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5] + paddd xm2, xm1 + pmovsxwd xm1, [bufq+xq*2] ; in dwords, y=0,x=[0,3] +.x_loop_ar2_inner: + pmaddwd xm3, xm9, xm0 + psrldq xm0, 2 + paddd xm3, xm2 + psrldq xm2, 4 ; shift top to next pixel + psrad xm3, [fg_dataq+FGData.ar_coeff_shift] + ; skip packssdw because we only care about one value + paddd xm3, xm1 + pminsd xm3, xm4 + psrldq xm1, 4 + pmaxsd xm3, xm5 + pextrw [bufq+xq*2], xm3, 0 + punpcklwd xm3, xm3 + pblendw xm0, xm3, 0010b + inc xq + jz .x_loop_ar2_end + test xb, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 +.x_loop_ar2_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar2 + RET + +.ar3: + DEFINE_ARGS buf, fg_data, bdmax, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + sar bdmaxd, 1 + movq xm7, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-6 + movd xm0, [fg_dataq+FGData.ar_coeffs_y+14] ; cf14-16 + pinsrb xm7, [fg_dataq+FGData.ar_coeffs_y+13], 7 ; cf0-6,13 + pinsrb xm0, [base+pb_1], 3 ; cf14-16,pb_1 + movd xm1, [fg_dataq+FGData.ar_coeffs_y+21] ; cf21-23 + vinserti128 m7, [fg_dataq+FGData.ar_coeffs_y+ 7], 1 ; cf7-13 + vinserti128 m0, [fg_dataq+FGData.ar_coeffs_y+17], 1 ; cf17-20 + vpbroadcastw xm11, [base+round_vals+shiftq*2-12] + movd xm12, bdmaxd ; max_grain + punpcklbw m7, m7 ; sign-extension + punpcklbw m0, m0 ; sign-extension + punpcklbw xm1, xm1 + REPX {psraw x, 8}, m7, m0, xm1 + pshufd m4, m7, q0000 ; cf[0,1] | cf[7,8] + pshufd m5, m7, q1111 ; cf[2,3] | cf[9,10] + pshufd m6, m7, q2222 ; cf[4,5] | cf[11,12] + pshufd xm7, xm7, q3333 ; cf[6,13] + pshufd m8, m0, q0000 ; cf[14,15] | cf[17,18] + pshufd m9, m0, q1111 ; cf[16],pw_1 | cf[19,20] + paddw xm0, xm11, xm11 + pcmpeqd xm13, xm13 + pblendw xm10, xm1, xm0, 00001000b + pxor xm13, xm12 ; min_grain + DEFINE_ARGS buf, fg_data, h, x + sub bufq, 2*(82*73-(82*3+79)) + mov hd, 70 +.y_loop_ar3: + mov xq, -76 +.x_loop_ar3: + movu xm0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] + vinserti128 m0, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4] + movq xm1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+8] + vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12] + palignr m3, m1, m0, 2 ; y=-3/-2,x=[-2,+5] + palignr m1, m0, 12 ; y=-3/-2,x=[+3,+6] + punpckhwd m2, m0, m3 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5] + punpcklwd m0, m3 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1] + shufps m3, m0, m2, q1032 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3] + pmaddwd m0, m4 + pmaddwd m2, m6 + pmaddwd m3, m5 + paddd m0, m2 + movu xm2, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] + vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8] + paddd m0, m3 + psrldq m3, m2, 2 + punpcklwd m3, m2, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] + pmaddwd m3, m8 ; x=[+0/+1,+1/+2,+2/+3,+3/+4] + paddd m0, m3 + psrldq m3, m2, 4 + psrldq m2, 6 + vpblendd m2, m11, 0x0f ; rounding constant + punpcklwd m3, m2 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd] + pmaddwd m3, m9 ; x=[+2/+3,+3/+4,+4/+5,+5,+6] + vextracti128 xm2, m1, 1 + punpcklwd xm1, xm2 + pmaddwd xm1, xm7 ; y=-3/-2 interleaved,x=[+3,+4,+5,+6] + paddd m0, m3 + vextracti128 xm2, m0, 1 + 
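; the two ymm lanes hold complementary partial sums; adding the extracted + ; high lane below leaves xm0 with the full top-row AR dot products for four + ; consecutive pixels, consumed one dword at a time by the inner loop +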
paddd xm0, xm1 + movu xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4] + paddd xm0, xm2 +.x_loop_ar3_inner: + pmaddwd xm2, xm1, xm10 + pshuflw xm3, xm2, q1032 + paddd xm2, xm0 ; add top + paddd xm2, xm3 ; left+cur + psrldq xm0, 4 + psrad xm2, [fg_dataq+FGData.ar_coeff_shift] + ; skip packssdw because we only care about one value + pminsd xm2, xm12 + pmaxsd xm2, xm13 + pextrw [bufq+xq*2], xm2, 0 + pslldq xm2, 4 + psrldq xm1, 2 + pblendw xm1, xm2, 0100b + inc xq + jz .x_loop_ar3_end + test xb, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 +.x_loop_ar3_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar3 + RET + +%macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y +INIT_XMM avx2 +cglobal generate_grain_uv_%1_16bpc, 4, 11, 8, buf, bufy, fg_data, uv, bdmax +%define base r8-generate_grain_uv_%1_16bpc_avx2_table + lea r8, [generate_grain_uv_%1_16bpc_avx2_table] + movifnidn bdmaxd, bdmaxm + vpbroadcastw xm0, [fg_dataq+FGData.seed] + mov r5d, [fg_dataq+FGData.grain_scale_shift] + movq xm1, [base+next_upperbit_mask] + lea r6d, [bdmaxq+1] + movq xm4, [base+mul_bits] + shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc + movq xm5, [base+hmul_bits] + sub r5, r6 + mova xm6, [base+pb_mask] + vpbroadcastd xm2, [base+pw_seed_xor+uvq*4] + vpbroadcastw xm7, [base+round+r5*2-2] + pxor xm0, xm2 + lea r6, [gaussian_sequence] +%if %2 + mov r7d, 73-35*%3 + add bufq, 44*2 +.loop_y: + mov r5, -44*2 +%else + mov r5, -82*73*2 + sub bufq, r5 +%endif +.loop_x: + pand xm2, xm0, xm1 + psrlw xm3, xm2, 10 + por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw xm2, xm4 ; bits 0x0f00 are set + pmulhuw xm0, xm5 + pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds + psllq xm2, xm3, 30 + por xm2, xm3 + psllq xm3, xm2, 15 + por xm2, xm0 ; aggregate each bit into next seed's high bit + por xm2, xm3 ; 4 next output seeds + pshuflw xm0, xm2, q3333 + psrlw xm2, 5 + movq r10, xm2 + movzx r9d, r10w + movd xm2, [r6+r9*2] + rorx r9, r10, 32 + shr r10d, 16 + pinsrw xm2, [r6+r10*2], 1 + movzx r10d, r9w + pinsrw xm2, [r6+r10*2], 2 + shr r9d, 16 + pinsrw xm2, [r6+r9*2], 3 + paddw xm2, xm2 ; otherwise bpc=12 w/ grain_scale_shift=0 + pmulhrsw xm2, xm7 ; shifts by 0, which pmulhrsw does not support + movq [bufq+r5], xm2 + add r5, 8 + jl .loop_x +%if %2 + add bufq, 82*2 + dec r7d + jg .loop_y +%endif + + ; auto-regression code + movsxd r6, [fg_dataq+FGData.ar_coeff_lag] + movsxd r6, [r8+r6*4] + add r6, r8 + jmp r6 + +INIT_YMM avx2 +.ar0: + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + vpbroadcastb m0, [fg_dataq+FGData.ar_coeffs_uv+uvq] + sar bdmaxd, 1 + vpbroadcastd m4, [base+gen_ar0_shift-24+shiftq*4] + movd xm6, bdmaxd + pcmpeqw m7, m7 + pmaddubsw m4, m0 ; ar_coeff << (14 - shift) + vpbroadcastw m6, xm6 ; max_grain + pxor m7, m6 ; min_grain + DEFINE_ARGS buf, bufy, h, x +%if %2 + vpbroadcastw m5, [base+hmul_bits+2+%3*2] + sub bufq, 2*(82*(73-35*%3)+82-(82*3+41)) +%else + sub bufq, 2*(82*70-3) +%endif + add bufyq, 2*(3+82*3) + mov hd, 70-35*%3 +.y_loop_ar0: +%if %2 + ; first 32 pixels + movu xm0, [bufyq+16*0] + vinserti128 m0, [bufyq+16*2], 1 + movu xm1, [bufyq+16*1] + vinserti128 m1, [bufyq+16*3], 1 +%if %3 + movu xm2, [bufyq+82*2+16*0] + vinserti128 m2, [bufyq+82*2+16*2], 1 + movu xm3, [bufyq+82*2+16*1] + vinserti128 m3, [bufyq+82*2+16*3], 1 + paddw m0, m2 + paddw m1, m3 +%endif + phaddw m0, m1 + movu xm1, [bufyq+16*4] + vinserti128 m1, [bufyq+16*6], 1 + movu xm2, [bufyq+16*5] + vinserti128 m2, [bufyq+16*7], 1 +%if %3 + movu xm3, [bufyq+82*2+16*4] + vinserti128 m3, [bufyq+82*2+16*6], 1 + paddw m1, m3 + 
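; (4:2:0 only: vertically adjacent luma rows are summed here; the phaddw + ; and pmulhrsw below then average horizontal pairs, completing the 2x2 -> 1 + ; luma downsample for each chroma grain position) +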
movu xm3, [bufyq+82*2+16*5] + vinserti128 m3, [bufyq+82*2+16*7], 1 + paddw m2, m3 +%endif + phaddw m1, m2 + pmulhrsw m0, m5 + pmulhrsw m1, m5 +%else + xor xd, xd +.x_loop_ar0: + movu m0, [bufyq+xq*2] + movu m1, [bufyq+xq*2+32] +%endif + paddw m0, m0 + paddw m1, m1 + pmulhrsw m0, m4 + pmulhrsw m1, m4 +%if %2 + paddw m0, [bufq+ 0] + paddw m1, [bufq+32] +%else + paddw m0, [bufq+xq*2+ 0] + paddw m1, [bufq+xq*2+32] +%endif + pminsw m0, m6 + pminsw m1, m6 + pmaxsw m0, m7 + pmaxsw m1, m7 +%if %2 + movu [bufq+ 0], m0 + movu [bufq+32], m1 + + ; last 6 pixels + movu xm0, [bufyq+32*4] + movu xm1, [bufyq+32*4+16] +%if %3 + paddw xm0, [bufyq+32*4+82*2] + paddw xm1, [bufyq+32*4+82*2+16] +%endif + phaddw xm0, xm1 + movu xm1, [bufq+32*2] + pmulhrsw xm0, xm5 + paddw xm0, xm0 + pmulhrsw xm0, xm4 + paddw xm0, xm1 + pminsw xm0, xm6 + pmaxsw xm0, xm7 + vpblendd xm0, xm1, 0x08 + movu [bufq+32*2], xm0 +%else + movu [bufq+xq*2+ 0], m0 + movu [bufq+xq*2+32], m1 + add xd, 32 + cmp xd, 64 + jl .x_loop_ar0 + + ; last 12 pixels + movu m0, [bufyq+64*2] + movu m1, [bufq+64*2] + paddw m0, m0 + pmulhrsw m0, m4 + paddw m0, m1 + pminsw m0, m6 + pmaxsw m0, m7 + vpblendd m0, m1, 0xc0 + movu [bufq+64*2], m0 +%endif + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar0 + RET + +INIT_XMM avx2 +.ar1: + DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x, shift + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] + movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] + pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3 + DEFINE_ARGS buf, bufy, h, val0, max, cf3, min, val3, x, shift + pmovsxbw xm4, xm4 + pshufd xm5, xm4, q1111 + pshufd xm4, xm4, q0000 + pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd + vpbroadcastw xm6, [base+hmul_bits+2+%3*2] + vpbroadcastd xm3, xm3 +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif + add bufyq, 2*(79+82*3) + mov hd, 70-35*%3 + sar maxd, 1 + mov mind, maxd + xor mind, -1 +.y_loop_ar1: + mov xq, -(76>>%2) + movsx val3d, word [bufq+xq*2-2] +.x_loop_ar1: + movu xm0, [bufq+xq*2-82*2-2] ; top/left +%if %2 + movu xm2, [bufyq+xq*4] +%else + movq xm2, [bufyq+xq*2] +%endif +%if %2 +%if %3 + phaddw xm2, [bufyq+xq*4+82*2] + punpckhqdq xm1, xm2, xm2 + paddw xm2, xm1 +%else + phaddw xm2, xm2 +%endif + pmulhrsw xm2, xm6 +%endif + psrldq xm1, xm0, 4 ; top/right + punpcklwd xm1, xm2 + psrldq xm2, xm0, 2 ; top + punpcklwd xm0, xm2 + pmaddwd xm1, xm5 + pmaddwd xm0, xm4 + paddd xm1, xm3 + paddd xm0, xm1 +.x_loop_ar1_inner: + movd val0d, xm0 + psrldq xm0, 4 + imul val3d, cf3d + add val3d, val0d + sarx val3d, val3d, shiftd + movsx val0d, word [bufq+xq*2] + add val3d, val0d + cmp val3d, maxd + cmovg val3d, maxd + cmp val3d, mind + cmovl val3d, mind + mov word [bufq+xq*2], val3w + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xb, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 +.x_loop_ar1_end: + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar1 + RET + +INIT_YMM avx2 +.ar2: +%if WIN64 + ; xmm6 and xmm7 already saved + %assign xmm_regs_used 13 + %2 + %assign stack_size_padded 136 + SUB rsp, stack_size_padded + movaps [rsp+16*2], xmm8 + movaps [rsp+16*3], xmm9 + movaps [rsp+16*4], xmm10 + movaps [rsp+16*5], xmm11 + movaps [rsp+16*6], xmm12 +%if %2 + movaps [rsp+16*7], xmm13 +%endif +%endif + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + vbroadcasti128 m10, [base+gen_shufA] + sar bdmaxd, 1 + 
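; bdmax>>1 is the grain clamp: 511 for 10bpc, 2047 for 12bpc; the matching + ; minimum is formed below as its one's complement via pcmpeqd+pxor +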
vbroadcasti128 m11, [base+gen_shufB] + movd xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 5] + pinsrb xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+12], 4 + pinsrb xm7, [base+pb_1], 5 + pinsrw xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+10], 3 + movhps xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] + pinsrb xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 9], 13 + pmovsxbw m7, xm7 + movd xm8, bdmaxd ; max_grain + pshufd m4, m7, q0000 + vpbroadcastw xm12, [base+round_vals-12+shiftq*2] + pshufd m5, m7, q1111 + pcmpeqd xm9, xm9 + pshufd m6, m7, q2222 + pxor xm9, xm8 ; min_grain + pshufd xm7, xm7, q3333 + DEFINE_ARGS buf, bufy, fg_data, h, x +%if %2 + vpbroadcastw xm13, [base+hmul_bits+2+%3*2] + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif + add bufyq, 2*(79+82*3) + mov hd, 70-35*%3 +.y_loop_ar2: + mov xq, -(76>>%2) +.x_loop_ar2: + vbroadcasti128 m3, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] + vinserti128 m2, m3, [bufq+xq*2-82*4-4], 1 ; y=-2,x=[-2,+5] + pshufb m0, m2, m10 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] + pmaddwd m0, m4 + pshufb m1, m2, m11 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] + pmaddwd m1, m5 + punpckhwd m2, m3 ; y=-2/-1 interleaved, x=[+2,+5] +%if %2 + movu xm3, [bufyq+xq*4] +%if %3 + paddw xm3, [bufyq+xq*4+82*2] +%endif + phaddw xm3, xm3 + pmulhrsw xm3, xm13 +%else + movq xm3, [bufyq+xq*2] +%endif + punpcklwd xm3, xm12 ; luma, round interleaved + vpblendd m2, m3, 0x0f + pmaddwd m2, m6 + paddd m1, m0 + movu xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5] + paddd m2, m1 + vextracti128 xm1, m2, 1 + paddd xm2, xm1 + pshufd xm1, xm0, q3321 + pmovsxwd xm1, xm1 ; y=0,x=[0,3] in dword +.x_loop_ar2_inner: + pmaddwd xm3, xm7, xm0 + paddd xm3, xm2 + psrldq xm2, 4 ; shift top to next pixel + psrad xm3, [fg_dataq+FGData.ar_coeff_shift] + ; we do not need to packssdw since we only care about one value + paddd xm3, xm1 + psrldq xm1, 4 + pminsd xm3, xm8 + pmaxsd xm3, xm9 + pextrw [bufq+xq*2], xm3, 0 + psrldq xm0, 2 + pslldq xm3, 2 + pblendw xm0, xm3, 00000010b + inc xq + jz .x_loop_ar2_end + test xb, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 +.x_loop_ar2_end: + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar2 + RET + +.ar3: +%if WIN64 + ; xmm6 and xmm7 already saved + %assign stack_offset 32 + %assign xmm_regs_used 14 + %2 + %assign stack_size_padded 152 + SUB rsp, stack_size_padded + movaps [rsp+16*2], xmm8 + movaps [rsp+16*3], xmm9 + movaps [rsp+16*4], xmm10 + movaps [rsp+16*5], xmm11 + movaps [rsp+16*6], xmm12 + movaps [rsp+16*7], xmm13 +%if %2 + movaps [rsp+16*8], xmm14 +%endif +%endif + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + vpbroadcastw xm11, [base+round_vals-12+shiftq*2] + sar bdmaxd, 1 + movq xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] + pinsrb xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+24], 7 ; luma + movhps xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 7] + pmovsxbw m7, xm7 +%if %2 + vpbroadcastw xm14, [base+hmul_bits+2+%3*2] +%endif + pshufd m4, m7, q0000 + pshufd m5, m7, q1111 + pshufd m6, m7, q2222 + pshufd m7, m7, q3333 + movd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+14] + pinsrb xm0, [base+pb_1], 3 + pinsrd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+21], 1 + pinsrd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+17], 2 + pmovsxbw m0, xm0 + movd xm12, bdmaxd ; max_grain + pshufd m8, m0, q0000 + pshufd m9, m0, q1111 + pcmpeqd xm13, xm13 + punpckhqdq xm10, xm0, xm0 + pxor xm13, xm12 ; min_grain + pinsrw xm10, [base+round_vals-10+shiftq*2], 3 + DEFINE_ARGS buf, bufy, fg_data, h, unused, x +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) 
+%else + sub bufq, 2*(82*69+3) +%endif + add bufyq, 2*(79+82*3) + mov hd, 70-35*%3 +.y_loop_ar3: + mov xq, -(76>>%2) +.x_loop_ar3: + movu xm2, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] + vinserti128 m2, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4] + movq xm1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+8] + vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12] + palignr m3, m1, m2, 2 ; y=-3/-2,x=[-2,+5] + palignr m1, m2, 12 ; y=-3/-2,x=[+3,+6] + punpcklwd m0, m2, m3 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1] + punpckhwd m2, m3 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5] + shufps m3, m0, m2, q1032 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3] + pmaddwd m0, m4 + pmaddwd m2, m6 + pmaddwd m3, m5 + paddd m0, m2 + paddd m0, m3 + movu xm2, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] + vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8] +%if %2 + movu xm3, [bufyq+xq*4] +%if %3 + paddw xm3, [bufyq+xq*4+82*2] +%endif + phaddw xm3, xm3 + pmulhrsw xm3, xm14 +%else + movq xm3, [bufyq+xq*2] +%endif + punpcklwd m1, m3 + pmaddwd m1, m7 + paddd m0, m1 + psrldq m1, m2, 4 + psrldq m3, m2, 6 + vpblendd m3, m11, 0x0f ; rounding constant + punpcklwd m1, m3 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd] + pmaddwd m1, m9 ; x=[+2/+3,+3/+4,+4/+5,+5,+6] + psrldq m3, m2, 2 + punpcklwd m2, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] + pmaddwd m2, m8 ; x=[+0/+1,+1/+2,+2/+3,+3/+4] + paddd m0, m1 + movu xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4] + paddd m0, m2 + vextracti128 xm2, m0, 1 + paddd xm0, xm2 +.x_loop_ar3_inner: + pmaddwd xm2, xm1, xm10 + pshuflw xm3, xm2, q1032 + paddd xm2, xm0 ; add top + paddd xm2, xm3 ; left+cur + psrldq xm0, 4 + psrad xm2, [fg_dataq+FGData.ar_coeff_shift] + psrldq xm1, 2 + ; no need to packssdw since we only care about one value + pminsd xm2, xm12 + pmaxsd xm2, xm13 + pextrw [bufq+xq*2], xm2, 0 + pslldq xm2, 4 + pblendw xm1, xm2, 00000100b + inc xq + jz .x_loop_ar3_end + test xb, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 +.x_loop_ar3_end: + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar3 + RET +%endmacro + +cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \ + grain_lut, unused, sby, see +%define base r11-grain_min + lea r11, [grain_min] + mov r6d, r9m ; bdmax + mov r9d, [fg_dataq+FGData.clip_to_restricted_range] + mov r7d, [fg_dataq+FGData.scaling_shift] + mov sbyd, sbym + vpbroadcastd m8, r9m + shr r6d, 11 ; is_12bpc + vpbroadcastd m9, [base+grain_min+r6*4] + shlx r10d, r9d, r6d + vpbroadcastd m10, [base+grain_max+r6*4] + lea r9d, [r6+r9*4] + vpbroadcastw m11, [base+mul_bits+r7*2-12] + vpbroadcastd m12, [base+fg_min+r10*4] + vpbroadcastd m13, [base+fg_max+r9*4] + test sbyd, sbyd + setnz r7b + vpbroadcastd m14, [base+pd_16] + test r7b, [fg_dataq+FGData.overlap_flag] + jnz .vertical_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rorx seed, seed, 24 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak + + lea src_bakq, [srcq+wq*2] + neg wq + sub dstq, srcq + +.loop_x: + rorx r6, seeq, 1 + or seed, 0xEFF4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164 + lea offyd, [offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak + + mov grain_lutq, grain_lutmp + mov hd, hm +.loop_y: + ; scaling[src] + mova m0, [srcq+ 0] + mova m1, [srcq+32] + pand m4, m8, m0 + psrld m3, m0, 16 + mova m6, m9 + 
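; m9 (grain_min, sign bit set in every lane) doubles as an all-on gather + ; mask; vpgatherdd zeroes its mask register, so it is bounced through m6 + ; around each gather +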
vpgatherdd m2, [scalingq+m4-0], m9 + pand m3, m8 + mova m9, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pand m5, m8, m1 + mova m6, m9 + vpgatherdd m3, [scalingq+m5-0], m9 + pblendw m4, m2, 0x55 + psrld m2, m1, 16 + mova m9, m6 + pand m2, m8 + vpgatherdd m5, [scalingq+m2-2], m6 + pblendw m5, m3, 0x55 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m4, m4 + paddw m5, m5 + pmulhrsw m4, [grain_lutq+offxyq*2] + pmulhrsw m5, [grain_lutq+offxyq*2+32] + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m12 + pmaxsw m1, m12 + pminsw m0, m13 + pminsw m1, m13 + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+32], m1 + + add srcq, strideq + add grain_lutq, 82*2 + dec hd + jg .loop_y + add wq, 32 + jge .end + lea srcq, [src_bakq+wq*2] + cmp byte [fg_dataq+FGData.overlap_flag], 0 + je .loop_x + movq xm7, [pw_27_17_17_27] + cmp dword r8m, 0 ; sby + jne .loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +.loop_x_h_overlap: + rorx r6, seeq, 1 + or seed, 0xEFF4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, left_offxy + + lea left_offxyd, [offyq+32] ; previous column's offy*stride+offx + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164 + lea offyd, [offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, left_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm +.loop_y_h_overlap: + ; scaling[src] + mova m0, [srcq+ 0] + mova m1, [srcq+32] + pand m4, m8, m0 + psrld m3, m0, 16 + mova m6, m9 + vpgatherdd m2, [scalingq+m4-0], m9 + pand m3, m8 + mova m9, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pand m5, m8, m1 + mova m6, m9 + vpgatherdd m3, [scalingq+m5-0], m9 + pblendw m4, m2, 0x55 + psrld m2, m1, 16 + mova m9, m6 + pand m2, m8 + vpgatherdd m5, [scalingq+m2-2], m6 + pblendw m5, m3, 0x55 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq*2] + movd xm6, [grain_lutq+left_offxyq*2] + punpcklwd xm6, xm3 + pmaddwd xm6, xm7 + paddd xm6, xm14 + psrad xm6, 5 + packssdw xm6, xm6 + pmaxsw xm6, xm9 + pminsw xm6, xm10 + vpblendd m3, m6, 0x01 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m4, m4 + paddw m5, m5 + pmulhrsw m4, m3 + pmulhrsw m5, [grain_lutq+offxyq*2+32] + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m12 + pmaxsw m1, m12 + pminsw m0, m13 + pminsw m1, m13 + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+32], m1 + + add srcq, strideq + add grain_lutq, 82*2 + dec hd + jg .loop_y_h_overlap + add wq, 32 + jge .end + lea srcq, [src_bakq+wq*2] + cmp dword r8m, 0 ; sby + jne .loop_x_hv_overlap + jmp .loop_x_h_overlap + +.vertical_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ + sby, see, src_bak + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + + lea src_bakq, [srcq+wq*2] + neg wq + sub dstq, srcq + +.loop_x_v_overlap: + vpbroadcastd m15, [pw_27_17_17_27] + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of 
top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, unused, top_offxy + + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, unused, top_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 +.loop_y_v_overlap: + ; scaling[src] + mova m0, [srcq+ 0] + mova m1, [srcq+32] + pand m4, m8, m0 + psrld m3, m0, 16 + mova m6, m9 + vpgatherdd m2, [scalingq+m4-0], m9 + pand m3, m8 + mova m9, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pand m5, m8, m1 + mova m6, m9 + vpgatherdd m3, [scalingq+m5-0], m9 + pblendw m2, m4, 0xaa + psrld m4, m1, 16 + mova m9, m6 + pand m4, m8 + vpgatherdd m5, [scalingq+m4-2], m6 + pblendw m3, m5, 0xaa + + ; grain = grain_lut[offy+y][offx+x] + movu m6, [grain_lutq+offxyq*2] + movu m5, [grain_lutq+top_offxyq*2] + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + pmaddwd m4, m15 + pmaddwd m5, m15 + movu m7, [grain_lutq+offxyq*2+32] + movu m6, [grain_lutq+top_offxyq*2+32] + paddd m4, m14 + paddd m5, m14 + psrad m4, 5 + psrad m5, 5 + packssdw m4, m5 + punpcklwd m5, m6, m7 + punpckhwd m6, m7 + pmaddwd m5, m15 + pmaddwd m6, m15 + paddd m5, m14 + paddd m6, m14 + psrad m5, 5 + psrad m6, 5 + packssdw m5, m6 + pmaxsw m4, m9 + pmaxsw m5, m9 + pminsw m4, m10 + pminsw m5, m10 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmaddubsw m2, m11 + pmaddubsw m3, m11 + paddw m2, m2 + paddw m3, m3 + pmulhrsw m4, m2 + pmulhrsw m5, m3 + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m12 + pmaxsw m1, m12 + pminsw m0, m13 + pminsw m1, m13 + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+32], m1 + + add srcq, strideq + add grain_lutq, 82*2 + dec hb + jz .end_y_v_overlap + vpbroadcastd m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + add hd, 0x80000000 + jnc .loop_y_v_overlap + jmp .loop_y +.end_y_v_overlap: + add wq, 32 + jge .end + lea srcq, [src_bakq+wq*2] + + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap + +.loop_x_hv_overlap: + vpbroadcastd m15, [pw_27_17_17_27] + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy + + lea topleft_offxyd, [top_offxyq+32] + lea left_offxyd, [offyq+32] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy + + mov grain_lutq, 
grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 +.loop_y_hv_overlap: + ; scaling[src] + mova m0, [srcq+ 0] + mova m1, [srcq+32] + pand m4, m8, m0 + psrld m3, m0, 16 + mova m6, m9 + vpgatherdd m2, [scalingq+m4-0], m9 + pand m3, m8 + mova m9, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pand m5, m8, m1 + mova m6, m9 + vpgatherdd m3, [scalingq+m5-0], m9 + pblendw m2, m4, 0xaa + psrld m4, m1, 16 + mova m9, m6 + pand m4, m8 + vpgatherdd m5, [scalingq+m4-2], m6 + pblendw m3, m5, 0xaa + + ; grain = grain_lut[offy+y][offx+x] + movu m7, [grain_lutq+offxyq*2] + movd xm6, [grain_lutq+left_offxyq*2] + movu m5, [grain_lutq+top_offxyq*2] + movd xm4, [grain_lutq+topleft_offxyq*2] + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklwd xm6, xm7 + punpcklwd xm4, xm5 + punpcklqdq xm6, xm4 + movddup xm4, [pw_27_17_17_27] + pmaddwd xm6, xm4 + paddd xm6, xm14 + psrad xm6, 5 + packssdw xm6, xm6 + pmaxsw xm6, xm9 + pminsw xm6, xm10 + pshuflw xm4, xm6, q1032 + vpblendd m6, m7, 0xfe + vpblendd m4, m5, 0xfe + ; followed by v interpolation (top | cur -> cur) + punpckhwd m5, m7 + pmaddwd m5, m15 + punpcklwd m4, m6 + pmaddwd m4, m15 + movu m7, [grain_lutq+offxyq*2+32] + movu m6, [grain_lutq+top_offxyq*2+32] + paddd m5, m14 + paddd m4, m14 + psrad m5, 5 + psrad m4, 5 + packssdw m4, m5 + punpcklwd m5, m6, m7 + punpckhwd m6, m7 + pmaddwd m5, m15 + pmaddwd m6, m15 + paddd m5, m14 + paddd m6, m14 + psrad m5, 5 + psrad m6, 5 + packssdw m5, m6 + pmaxsw m4, m9 + pmaxsw m5, m9 + pminsw m4, m10 + pminsw m5, m10 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmaddubsw m2, m11 + pmaddubsw m3, m11 + paddw m2, m2 + paddw m3, m3 + pmulhrsw m4, m2 + pmulhrsw m5, m3 + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m12 + pmaxsw m1, m12 + pminsw m0, m13 + pminsw m1, m13 + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+32], m1 + + add srcq, strideq + add grain_lutq, 82*2 + dec hb + jz .end_y_hv_overlap + vpbroadcastd m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + add hd, 0x80000000 + jnc .loop_y_hv_overlap + movq xm7, [pw_27_17_17_27] + jmp .loop_y_h_overlap +.end_y_hv_overlap: + add wq, 32 + lea srcq, [src_bakq+wq*2] + jl .loop_x_hv_overlap +.end: + RET + +%macro FGUV_FN 3 ; name, ss_hor, ss_ver +cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ + grain_lut, h, sby, luma, lstride, uv_pl, is_id +%define base r12-grain_min + lea r12, [grain_min] + mov r9d, r13m ; bdmax + mov r7d, [fg_dataq+FGData.scaling_shift] + mov r11d, is_idm + mov sbyd, sbym + vpbroadcastw m11, [base+mul_bits+r7*2-12] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] + shr r9d, 11 ; is_12bpc + vpbroadcastd m8, [base+grain_min+r9*4] + shlx r10d, r6d, r9d + vpbroadcastd m9, [base+grain_max+r9*4] + vpbroadcastw m10, r13m + shlx r6d, r6d, r11d + vpbroadcastd m12, [base+fg_min+r10*4] + lea r6d, [r9+r6*2] + vpbroadcastd m13, [base+fg_max+r6*4] + test sbyd, sbyd + setnz r7b + cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 + jne .csfl + +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused, sby, see, overlap + +%if %1 + mov r6d, r11m + vpbroadcastd m0, [base+pb_8_9_0_1] + vpbroadcastd m1, [base+uv_offset_mul+r9*4] + vbroadcasti128 m14, [fg_dataq+FGData.uv_mult+r6*4] + vpbroadcastd m15, [fg_dataq+FGData.uv_offset+r6*4] + pshufb m14, m0 ; { 
uv_luma_mult, uv_mult } + pmaddwd m15, m1 +%else +%if %2 + vpbroadcastq m15, [base+pw_23_22] +%else + vpbroadcastq m15, [base+pw_27_17_17_27] +%endif + vpbroadcastd m14, [base+pd_16] +%endif + test r7b, [fg_dataq+FGData.overlap_flag] + jnz %%vertical_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rorx seed, seed, 24 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused2, unused3, see, unused4, unused5, unused6, luma, lstride + + mov lumaq, r9mp + mov lstrideq, r10mp + lea r10, [srcq+wq*2] + lea r11, [dstq+wq*2] + lea r12, [lumaq+wq*(2<<%2)] + mov r9mp, r10 + mov r11mp, r11 + mov r12mp, r12 + neg wq + +%%loop_x: + rorx r6, seeq, 1 + or seed, 0xEFF4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, unused1, unused2, unused3, luma, lstride + + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, unused1, unused2, unused3, luma, lstride + + mov grain_lutq, grain_lutmp + mov hd, hm +%%loop_y: + ; luma_src +%if %2 + mova xm2, [lumaq+lstrideq*0+ 0] + vinserti128 m2, [lumaq+lstrideq*0+32], 1 + mova xm4, [lumaq+lstrideq*0+16] + vinserti128 m4, [lumaq+lstrideq*0+48], 1 + mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] + vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 + mova xm5, [lumaq+lstrideq*(1<<%3)+16] + vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 + phaddw m2, m4 + phaddw m3, m5 + pxor m4, m4 + pavgw m2, m4 + pavgw m3, m4 +%elif %1 + mova m2, [lumaq+ 0] + mova m3, [lumaq+32] +%endif +%if %1 + mova m0, [srcq] +%if %2 + mova m1, [srcq+strideq] +%else + mova m1, [srcq+32] +%endif + punpckhwd m4, m2, m0 + punpcklwd m2, m0 + punpckhwd m5, m3, m1 + punpcklwd m3, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m4, m2, m5, m3 + REPX {paddd x, m15}, m4, m2, m5, m3 + REPX {psrad x, 6 }, m4, m2, m5, m3 + packusdw m2, m4 + packusdw m3, m5 + pminuw m2, m10 + pminuw m3, m10 ; clip_pixel() +%elif %2 + pand m2, m10 + pand m3, m10 +%else + pand m2, m10, [lumaq+ 0] + pand m3, m10, [lumaq+32] +%endif + + ; scaling[luma_src] + vpbroadcastd m7, [pd_m65536] + pandn m4, m7, m2 + mova m6, m7 + vpgatherdd m5, [scalingq+m4-0], m7 + psrld m2, 16 + mova m7, m6 + vpgatherdd m4, [scalingq+m2-2], m6 + pblendw m4, m5, 0x55 + pandn m5, m7, m3 + mova m6, m7 + vpgatherdd m2, [scalingq+m5-0], m7 + psrld m3, 16 + vpgatherdd m5, [scalingq+m3-2], m6 + pblendw m5, m2, 0x55 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m4, m4 + paddw m5, m5 + pmulhrsw m4, [grain_lutq+offxyq*2] +%if %2 + pmulhrsw m5, [grain_lutq+offxyq*2+82*2] +%else + pmulhrsw m5, [grain_lutq+offxyq*2+32] +%endif + + ; dst = clip_pixel(src, noise) +%if %1 + paddw m0, m4 + paddw m1, m5 +%else + paddw m0, m4, [srcq] +%if %2 + paddw m1, m5, [srcq+strideq] +%else + paddw m1, m5, [srcq+32] +%endif +%endif + pmaxsw m0, m12 + pmaxsw m1, m12 + pminsw m0, m13 + pminsw m1, m13 + mova [dstq], m0 +%if %2 + mova [dstq+strideq], m1 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + mova [dstq+32], m1 + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82*(2<<%2) +%if %2 + sub hb, 2 +%else + dec hb +%endif + jg %%loop_y + add wq, 
32>>%2 + jge .end + mov srcq, r9mp + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + cmp byte [fg_dataq+FGData.overlap_flag], 0 + je %%loop_x + cmp dword r8m, 0 ; sby + jne %%loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +%%loop_x_h_overlap: + rorx r6, seeq, 1 + or seed, 0xEFF4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, unused1, unused2, luma, lstride + + lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, unused1, unused2, luma, lstride + + mov grain_lutq, grain_lutmp + mov hd, hm +%%loop_y_h_overlap: + ; luma_src +%if %2 + mova xm2, [lumaq+lstrideq*0+ 0] + vinserti128 m2, [lumaq+lstrideq*0+32], 1 + mova xm4, [lumaq+lstrideq*0+16] + vinserti128 m4, [lumaq+lstrideq*0+48], 1 + mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] + vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 + mova xm5, [lumaq+lstrideq*(1<<%3)+16] + vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 + phaddw m2, m4 + phaddw m3, m5 + pxor m4, m4 + pavgw m2, m4 + pavgw m3, m4 +%elif %1 + mova m2, [lumaq] + mova m3, [lumaq+32] +%endif +%if %1 + mova m0, [srcq] +%if %2 + mova m1, [srcq+strideq] +%else + mova m1, [srcq+32] +%endif + punpckhwd m4, m2, m0 + punpcklwd m2, m0 + punpckhwd m5, m3, m1 + punpcklwd m3, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m4, m2, m5, m3 + REPX {paddd x, m15}, m4, m2, m5, m3 + REPX {psrad x, 6 }, m4, m2, m5, m3 + packusdw m2, m4 + packusdw m3, m5 + pminuw m2, m10 ; clip_pixel() + pminuw m3, m10 +%elif %2 + pand m2, m10 + pand m3, m10 +%else + pand m2, m10, [lumaq+ 0] + pand m3, m10, [lumaq+32] +%endif + + ; scaling[luma_src] + vpbroadcastd m7, [pd_m65536] + pandn m4, m7, m2 + mova m6, m7 + vpgatherdd m5, [scalingq+m4-0], m7 + psrld m2, 16 + mova m7, m6 + vpgatherdd m4, [scalingq+m2-2], m6 + pblendw m4, m5, 0x55 + pandn m5, m7, m3 + mova m6, m7 + vpgatherdd m2, [scalingq+m5-0], m7 + psrld m3, 16 + vpgatherdd m5, [scalingq+m3-2], m6 + pblendw m5, m2, 0x55 + + ; grain = grain_lut[offy+y][offx+x] + movu m2, [grain_lutq+offxyq*2] +%if %2 + movu m3, [grain_lutq+offxyq*2+82*2] +%else + movu m3, [grain_lutq+offxyq*2+32] +%endif + movd xm6, [grain_lutq+left_offxyq*2] +%if %2 + pinsrw xm6, [grain_lutq+left_offxyq*2+82*2], 2 ; {left0, left1} + punpckldq xm7, xm2, xm3 ; {cur0, cur1} + punpcklwd xm6, xm7 ; {left0, cur0, left1, cur1} +%else + punpcklwd xm6, xm2 +%endif +%if %1 +%if %2 + vpbroadcastq xm7, [pw_23_22] +%else + movq xm7, [pw_27_17_17_27] +%endif + pmaddwd xm6, xm7 + vpbroadcastd xm7, [pd_16] + paddd xm6, xm7 +%else + pmaddwd xm6, xm15 + paddd xm6, xm14 +%endif + psrad xm6, 5 + packssdw xm6, xm6 + pmaxsw xm6, xm8 + pminsw xm6, xm9 + vpblendd m2, m6, 0x01 +%if %2 + pshuflw xm6, xm6, q1032 + vpblendd m3, m6, 0x01 +%endif + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m4, m4 + paddw m5, m5 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + + ; dst = clip_pixel(src, noise) +%if %1 + paddw m0, m2 + paddw m1, m3 +%else + paddw m0, m2, [srcq] +%if %2 + paddw m1, m3, [srcq+strideq] +%else + paddw m1, m3, [srcq+32] +%endif +%endif + pmaxsw m0, m12 + pmaxsw m1, m12 + pminsw m0, m13 
+ pminsw m1, m13 + mova [dstq], m0 +%if %2 + mova [dstq+strideq], m1 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + mova [dstq+32], m1 + add srcq, strideq + add dstq, strideq + add lumaq, r10mp +%endif + add grain_lutq, 82*(2<<%2) +%if %2 + sub hb, 2 +%else + dec hb +%endif + jg %%loop_y_h_overlap + add wq, 32>>%2 + jge .end + mov srcq, r9mp + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + cmp dword r8m, 0 ; sby + jne %%loop_x_hv_overlap + jmp %%loop_x_h_overlap + +%%vertical_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ + sby, see, unused1, unused2, unused3, lstride + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, unused1, top_offxy, unused2, luma, lstride + + mov lumaq, r9mp + mov lstrideq, r10mp + lea r10, [srcq+wq*2] + lea r11, [dstq+wq*2] + lea r12, [lumaq+wq*(2<<%2)] + mov r9mp, r10 + mov r11mp, r11 + mov r12mp, r12 + neg wq + +%%loop_x_v_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, unused1, top_offxy, unused2, luma, lstride + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 +%if %2 == 0 + lea r10, [pw_27_17_17_27] +%endif +%%loop_y_v_overlap: + ; luma_src +%if %2 + mova xm2, [lumaq+lstrideq*0+ 0] + vinserti128 m2, [lumaq+lstrideq*0+32], 1 + mova xm4, [lumaq+lstrideq*0+16] + vinserti128 m4, [lumaq+lstrideq*0+48], 1 + mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] + vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 + mova xm5, [lumaq+lstrideq*(1<<%3)+16] + vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 + phaddw m2, m4 + phaddw m3, m5 + pxor m4, m4 + pavgw m2, m4 + pavgw m3, m4 +%elif %1 + mova m2, [lumaq] + mova m3, [lumaq+32] +%endif +%if %1 + mova m0, [srcq] +%if %2 + mova m1, [srcq+strideq] +%else + mova m1, [srcq+32] +%endif + punpckhwd m4, m2, m0 + punpcklwd m2, m0 + punpckhwd m5, m3, m1 + punpcklwd m3, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m4, m2, m5, m3 + REPX {paddd x, m15}, m4, m2, m5, m3 + REPX {psrad x, 6 }, m4, m2, m5, m3 + packusdw m2, m4 + packusdw m3, m5 + pminuw m2, m10 ; clip_pixel() + pminuw m3, m10 +%elif %2 + pand m2, m10 + pand m3, m10 +%else + pand m2, m10, [lumaq+ 0] + pand m3, m10, [lumaq+32] +%endif + + ; scaling[luma_src] + vpbroadcastd m7, [pd_m65536] + pandn m4, m7, m2 + mova m6, m7 + vpgatherdd m5, [scalingq+m4-0], m7 + psrld m2, 16 + mova m7, m6 + vpgatherdd m4, [scalingq+m2-2], m6 + pblendw m4, m5, 0x55 + pandn m5, m7, m3 + mova m6, m7 + vpgatherdd m2, [scalingq+m5-0], m7 + psrld m3, 16 + vpgatherdd m5, 
[scalingq+m3-2], m6 + pblendw m5, m2, 0x55 + + ; grain = grain_lut[offy+y][offx+x] + movu m6, [grain_lutq+offxyq*2] + movu m3, [grain_lutq+top_offxyq*2] + punpcklwd m2, m3, m6 + punpckhwd m3, m6 ; { top, cur } +%if %3 + vpbroadcastd m0, [pw_23_22] +%elif %2 + vpbroadcastd m0, [pw_27_17_17_27] +%else + vpbroadcastd m0, [r10] +%endif + REPX {pmaddwd x, m0}, m2, m3 +%if %1 + vpbroadcastd m1, [pd_16] + REPX {paddd x, m1}, m2, m3 +%else + REPX {paddd x, m14}, m2, m3 +%endif + REPX {psrad x, 5}, m2, m3 + packssdw m2, m3 +%if %2 + movu m3, [grain_lutq+offxyq*2+82*2] +%else + movu m3, [grain_lutq+offxyq*2+32] +%endif +%if %3 + pmaxsw m2, m8 + pminsw m2, m9 +%else +%if %2 + movu m7, [grain_lutq+top_offxyq*2+82*2] + punpckhwd m6, m3, m7 ; { cur, top } + punpcklwd m3, m7 +%else + movu m7, [grain_lutq+top_offxyq*2+32] + punpckhwd m6, m7, m3 + punpcklwd m3, m7, m3 ; { top, cur } +%endif + pmaddwd m6, m0 + pmaddwd m3, m0 +%if %1 + paddd m6, m1 + paddd m3, m1 +%else + paddd m6, m14 + paddd m3, m14 +%endif + psrad m6, 5 + psrad m3, 5 + packssdw m3, m6 + pmaxsw m2, m8 + pmaxsw m3, m8 + pminsw m2, m9 + pminsw m3, m9 +%endif + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m4, m4 + paddw m5, m5 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + + ; dst = clip_pixel(src, noise) + paddw m0, m2, [srcq] +%if %2 + paddw m1, m3, [srcq+strideq] +%else + paddw m1, m3, [srcq+32] +%endif + pmaxsw m0, m12 + pmaxsw m1, m12 + pminsw m0, m13 + pminsw m1, m13 + mova [dstq], m0 +%if %2 + mova [dstq+strideq], m1 + sub hb, 2 +%else + mova [dstq+32], m1 + dec hb +%endif + jle %%end_y_v_overlap +%if %2 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82*(2<<%2) +%if %2 + jmp %%loop_y +%else + add hd, 0x80000000 + jc %%loop_y + add r10, 4 + jmp %%loop_y_v_overlap +%endif +%%end_y_v_overlap: + add wq, 32>>%2 + jge .end + mov srcq, r9mp + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap +%%loop_x_hv_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride + +%if %2 == 0 + lea r14, [pw_27_17_17_27] +%endif + lea topleft_offxyq, [top_offxyq+(32>>%2)] + lea left_offxyq, [offyq+(32>>%2)] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 +%%loop_y_hv_overlap: + ; luma_src +%if %2 + mova xm2, [lumaq+lstrideq*0+ 0] + vinserti128 m2, [lumaq+lstrideq*0+32], 1 + mova xm4, [lumaq+lstrideq*0+16] + vinserti128 m4, 
[lumaq+lstrideq*0+48], 1 + mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] + vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 + mova xm5, [lumaq+lstrideq*(1<<%3)+16] + vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 + phaddw m2, m4 + phaddw m3, m5 + pxor m4, m4 + pavgw m2, m4 + pavgw m3, m4 +%elif %1 + mova m2, [lumaq] + mova m3, [lumaq+32] +%endif +%if %1 + mova m0, [srcq] +%if %2 + mova m1, [srcq+strideq] +%else + mova m1, [srcq+32] +%endif + punpckhwd m4, m2, m0 + punpcklwd m2, m0 + punpckhwd m5, m3, m1 + punpcklwd m3, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m4, m2, m5, m3 + REPX {paddd x, m15}, m4, m2, m5, m3 + REPX {psrad x, 6 }, m4, m2, m5, m3 + packusdw m2, m4 + packusdw m3, m5 + pminuw m2, m10 ; clip_pixel() + pminuw m3, m10 +%elif %2 + pand m2, m10 + pand m3, m10 +%else + pand m2, m10, [lumaq+ 0] + pand m3, m10, [lumaq+32] +%endif + + ; scaling[luma_src] + vpbroadcastd m7, [pd_m65536] + pandn m4, m7, m2 + mova m6, m7 + vpgatherdd m5, [scalingq+m4-0], m7 + psrld m2, 16 + mova m7, m6 + vpgatherdd m4, [scalingq+m2-2], m6 + pblendw m4, m5, 0x55 + pandn m5, m7, m3 + mova m6, m7 + vpgatherdd m2, [scalingq+m5-0], m7 + psrld m3, 16 + vpgatherdd m5, [scalingq+m3-2], m6 + pblendw m5, m2, 0x55 + + ; grain = grain_lut[offy+y][offx+x] + movu m0, [grain_lutq+offxyq*2] + movd xm2, [grain_lutq+left_offxyq*2] + movu m6, [grain_lutq+top_offxyq*2] +%if %2 + pinsrw xm2, [grain_lutq+left_offxyq*2+82*2], 2 + movu m3, [grain_lutq+offxyq*2+82*2] + punpckldq xm1, xm0, xm3 ; { cur0, cur1 } +%if %3 + vinserti128 m2, [grain_lutq+topleft_offxyq*2], 1 ; { left0, left1, top/left } + vinserti128 m1, [grain_lutq+top_offxyq*2], 1 ; { cur0, cur1, top0 } +%else + vinserti128 m2, [grain_lutq+topleft_offxyq*2+82*2], 1 + vpbroadcastd m7, [grain_lutq+topleft_offxyq*2] + vpblendd m2, m7, 0x20 + movd xm7, [grain_lutq+top_offxyq*2+82*2] + punpckldq xm7, xm6 + vinserti128 m1, xm7, 1 + movu m7, [grain_lutq+top_offxyq*2+82*2] +%endif + punpcklwd m2, m1 ; { cur, left } +%if %1 + vpbroadcastq m1, [pw_23_22] + pmaddwd m2, m1 + vpbroadcastd m1, [pd_16] + paddd m2, m1 + psrad m2, 5 + packssdw m2, m2 + vpermq m2, m2, q3120 +%else + pmaddwd m2, m15 + paddd m2, m14 + psrad m2, 5 + vextracti128 xm1, m2, 1 + packssdw xm2, xm1 +%endif +%else + pinsrd xm2, [grain_lutq+topleft_offxyq*2], 1 + movu m3, [grain_lutq+offxyq*2+32] + movu m7, [grain_lutq+top_offxyq*2+32] + punpckldq xm1, xm0, xm6 + punpcklwd xm2, xm1 ; { cur, left } +%if %1 + movddup xm1, [pw_27_17_17_27] + pmaddwd xm2, xm1 + vpbroadcastd m1, [pd_16] + paddd xm2, xm1 +%else + pmaddwd xm2, xm15 + paddd xm2, xm14 +%endif + psrad xm2, 5 + packssdw xm2, xm2 +%endif + pmaxsw xm2, xm8 + pminsw xm2, xm9 + vpblendd m0, m2, 0x01 +%if %2 + pshufd xm2, xm2, q0321 + vpblendd m3, m2, 0x01 +%if %3 == 0 + pshufd xm2, xm2, q0321 + vpblendd m7, m2, 0x01 +%endif +%endif + pshuflw xm2, xm2, q1032 + vpblendd m2, m6, 0xfe + punpckhwd m6, m0 ; { top, cur } + punpcklwd m2, m0 +%if %3 + vpbroadcastd m0, [pw_23_22] +%elif %2 + vpbroadcastd m0, [pw_27_17_17_27] +%else + vpbroadcastd m0, [r14] +%endif + pmaddwd m6, m0 + pmaddwd m2, m0 +%if %1 + paddd m6, m1 + paddd m2, m1 +%else + paddd m6, m14 + paddd m2, m14 +%endif + psrad m6, 5 + psrad m2, 5 + packssdw m2, m6 + +%if %3 + pmaxsw m2, m8 + pminsw m2, m9 +%else +%if %2 + punpckhwd m6, m3, m7 + punpcklwd m3, m7 ; { cur, top } +%else + punpckhwd m6, m7, m3 + punpcklwd m3, m7, m3 ; { top, cur } +%endif + REPX {pmaddwd x, m0}, m6, m3 +%if %1 + REPX {paddd x, m1}, m6, m3 +%else + REPX {paddd x, m14}, m6, m3 +%endif + REPX {psrad x, 5}, m6, m3 + packssdw m3, m6 + 
pmaxsw m2, m8 + pmaxsw m3, m8 + pminsw m2, m9 + pminsw m3, m9 +%endif + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m4, m4 + paddw m5, m5 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + + ; dst = clip_pixel(src, noise) + paddw m0, m2, [srcq] +%if %2 + paddw m1, m3, [srcq+strideq] +%else + paddw m1, m3, [srcq+32] +%endif + pmaxsw m0, m12 + pmaxsw m1, m12 + pminsw m0, m13 + pminsw m1, m13 + mova [dstq], m0 +%if %2 + mova [dstq+strideq], m1 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + mova [dstq+32], m1 + add srcq, strideq + add dstq, strideq + add lumaq, r10mp +%endif + add grain_lutq, 82*(2<<%2) +%if %2 + sub hb, 2 + jg %%loop_y_h_overlap +%else + dec hb + jle %%end_y_hv_overlap + add hd, 0x80000000 + jc %%loop_y_h_overlap + add r14, 4 + jmp %%loop_y_hv_overlap +%endif +%%end_y_hv_overlap: + add wq, 32>>%2 + jge .end + mov srcq, r9mp + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + jmp %%loop_x_hv_overlap +%endmacro + + %%FGUV_32x32xN_LOOP 1, %2, %3 +.csfl: + %%FGUV_32x32xN_LOOP 0, %2, %3 +.end: + RET +%endmacro + +GEN_GRAIN_UV_FN 420, 1, 1 +FGUV_FN 420, 1, 1 +GEN_GRAIN_UV_FN 422, 1, 0 +FGUV_FN 422, 1, 0 +GEN_GRAIN_UV_FN 444, 0, 0 +FGUV_FN 444, 0, 0 + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/filmgrain16_avx512.asm b/third_party/dav1d/src/x86/filmgrain16_avx512.asm new file mode 100644 index 0000000000..00dd6af599 --- /dev/null +++ b/third_party/dav1d/src/x86/filmgrain16_avx512.asm @@ -0,0 +1,932 @@ +; Copyright © 2022, VideoLAN and dav1d authors +; Copyright © 2022, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
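+
+; 16bpc film grain synthesis, AVX-512 (Ice Lake) variant. The per-pixel model
+; is the same as in the AVX2 code above; as a rough C sketch (helper names
+; illustrative, not the dav1d API):
+;   noise = round2(scaling[clip_pixel(luma_src)] * grain, scaling_shift);
+;   *dst  = iclip(*src + noise, fg_min, fg_max);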
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" +%include "x86/filmgrain_common.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 +pb_0to63: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + db 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47 + db 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 +scale_mask: db -1, -1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1 +scale_shift: dw 7, 7, 6, 6, 5, 5, 4, 4 +pw_27_17_17_27: dw 108, 68, 68, 108, 27, 17, 17, 27 +pw_23_22: dw 92, 88, 0, 128, 23, 22, 0, 32 +fg_min: times 2 dw 0 + times 2 dw 64 + times 2 dw 256 +fg_max: times 2 dw 1023 + times 2 dw 4095 + times 2 dw 960 + times 2 dw 3840 + times 2 dw 940 + times 2 dw 3760 +scale_rnd: dd 64 + dd 16 +uv_offset_mul: dd 256 + dd 1024 +pb_8_9_0_1: db 8, 9, 0, 1 + +SECTION .text + +INIT_ZMM avx512icl +cglobal fgy_32x32xn_16bpc, 6, 15, 21, dst, src, stride, fg_data, w, scaling, \ + grain_lut, offx, sby, see, offy, src_bak +%define base r11-fg_min + lea r11, [fg_min] + mov r6d, r9m ; bdmax + mov r9d, [fg_dataq+FGData.clip_to_restricted_range] + mov r7d, [fg_dataq+FGData.scaling_shift] + mov sbyd, sbym + vpbroadcastd m6, r9m + shr r6d, 11 ; is_12bpc + vbroadcasti32x4 m7, [base+scale_mask] + shlx r10d, r9d, r6d + vpbroadcastd m10, [base+scale_shift+r7*4-32] + lea r9d, [r6+r9*4] + vpbroadcastd m8, [base+fg_min+r10*4] + kxnorw k1, k1, k1 ; 0xffff + vpbroadcastd m9, [base+fg_max+r9*4] + mov r12, 0xeeeeeeeeeeeeeeee + vpbroadcastd m19, [base+scale_rnd+r6*4] + kshiftrb k2, k1, 4 ; 0xf + vpbroadcastq xm20, [base+pw_27_17_17_27+r6*8] + kmovq k3, r12 + vpbroadcastd m11, [base+scale_shift+r6*8+4] + test sbyd, sbyd + setnz r7b + vpbroadcastd m12, [base+pw_27_17_17_27+r6*8+0] + vpbroadcastd m13, [base+pw_27_17_17_27+r6*8+4] + test r7b, [fg_dataq+FGData.overlap_flag] + jnz .v_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rorx seed, seed, 24 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + lea src_bakq, [srcq+wq*2] + neg wq + sub dstq, srcq + +.loop_x: + rorx r6, seeq, 1 + or seed, 0xeff4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164 + lea offyd, [offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \ + sby, see, offxy, src_bak + + mov grain_lutq, grain_lutmp + mov hd, hm +.loop_y: + movu m4, [grain_lutq+offxyq*2+82*0] + movu m5, [grain_lutq+offxyq*2+82*2] + call .add_noise + sub hb, 2 + jg .loop_y + add wq, 32 + jge .end + lea srcq, [src_bakq+wq*2] + cmp byte [fg_dataq+FGData.overlap_flag], 0 + je .loop_x + test sbyd, sbyd + jnz .hv_overlap + + ; horizontal overlap (without vertical overlap) +.loop_x_h_overlap: + rorx r6, seeq, 1 + or seed, 0xeff4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \ + sby, see, offy, src_bak, left_offxy + + lea left_offxyd, [offyq+73] ; previous column's offy*stride+offx + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164 + lea offyd, [offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \ + sby, see, offxy, src_bak, left_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm +.loop_y_h_overlap: + movu m4, [grain_lutq+offxyq*2+82*0] + movu m5, [grain_lutq+offxyq*2+82*2] + movd xm17, 
[grain_lutq+left_offxyq*2-82*1] + pinsrd xm17, [grain_lutq+left_offxyq*2+82*1], 1 + punpckldq xm16, xm4, xm5 + punpcklwd xm17, xm16 + mova xm16, xm19 + vpdpwssd xm16, xm20, xm17 + psrad xm16, 1 + packssdw xm16, xm16 + vpsravw xm16, xm11 + vmovdqu8 m4{k2}, m16 + vpalignr m5{k2}, m16, m16, 4 + call .add_noise + sub hb, 2 + jg .loop_y_h_overlap + add wq, 32 + jge .end + lea srcq, [src_bakq+wq*2] + test sbyd, sbyd + jnz .hv_overlap + jmp .loop_x_h_overlap + +.v_overlap: + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + lea src_bakq, [srcq+wq*2] + neg wq + sub dstq, srcq + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \ + sby, see, offy, src_bak, _, top_offxy + + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \ + sby, see, offxy, src_bak, _, top_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 + + movu m16, [grain_lutq+offxyq*2+82*0] + movu m0, [grain_lutq+top_offxyq*2+82*0] + movu m17, [grain_lutq+offxyq*2+82*2] + movu m1, [grain_lutq+top_offxyq*2+82*2] + punpckhwd m4, m0, m16 + punpcklwd m0, m16 + punpckhwd m5, m1, m17 + punpcklwd m1, m17 + call .add_noise_v + sub hb, 2 + jg .loop_y + add wq, 32 + jge .end + lea srcq, [src_bakq+wq*2] + + ; since fg_dataq.overlap is guaranteed to be set, we never jump back + ; to .v_overlap, and instead always fall-through to .hv_overlap +.hv_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \ + sby, see, offy, src_bak, left_offxy, top_offxy, topleft_offxy + + lea topleft_offxyd, [top_offxyq+73] + lea left_offxyd, [offyq+73] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \ + sby, see, offxy, src_bak, left_offxy, top_offxy, topleft_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 + + movu m5, [grain_lutq+offxyq*2+82*0] + movu m0, [grain_lutq+top_offxyq*2+82*0] + movd xm17, [grain_lutq+left_offxyq*2-82*1] + pinsrd xm17, [grain_lutq+topleft_offxyq*2-82*1], 1 + movu m2, [grain_lutq+offxyq*2+82*2] + movu m1, [grain_lutq+top_offxyq*2+82*2] + movd xm18, [grain_lutq+left_offxyq*2+82*1] + pinsrd xm18, [grain_lutq+topleft_offxyq*2+82*1], 1 + punpckldq xm16, xm5, xm0 + 
punpcklwd xm17, xm16 + mova xm16, xm19 + vpdpwssd xm16, xm20, xm17 + punpckldq xm17, xm2, xm1 + punpcklwd xm18, xm17 + mova xm17, xm19 + vpdpwssd xm17, xm20, xm18 + punpckhwd m4, m0, m5 + punpcklwd m0, m5 + punpckhwd m5, m1, m2 + punpcklwd m1, m2 + psrad xm16, 1 + psrad xm17, 1 + packssdw xm16, xm17 + vpsravw xm16, xm11 + vpshuflw m0{k2}, m16, q1302 + punpckhqdq xm16, xm16 + vpshuflw m1{k2}, m16, q1302 + call .add_noise_v + sub hb, 2 + jg .loop_y_h_overlap + add wq, 32 + lea srcq, [src_bakq+wq*2] + jl .hv_overlap +.end: + RET +ALIGN function_align +.add_noise_v: + mova m2, m19 + vpdpwssd m2, m12, m4 + mova m3, m19 + vpdpwssd m3, m13, m5 + mova m4, m19 + vpdpwssd m4, m12, m0 + mova m5, m19 + vpdpwssd m5, m13, m1 + REPX {psrad x, 1}, m2, m3, m4, m5 + packssdw m4, m2 + packssdw m5, m3 + vpsravw m4, m11 + vpsravw m5, m11 +.add_noise: + mova m0, [srcq+strideq*0] + mova m1, [srcq+strideq*1] + kmovw k4, k1 + pand m16, m6, m0 + psrld m3, m0, 16 + vpgatherdd m2{k4}, [scalingq+m16] + vpcmpud k4, m3, m6, 2 ; px <= bdmax + vpgatherdd m16{k4}, [scalingq+m3] + kmovw k4, k1 + pand m17, m6, m1 + vpgatherdd m3{k4}, [scalingq+m17] + vpshufb m2{k3}, m16, m7 + psrld m16, m1, 16 + vpcmpud k4, m16, m6, 2 + vpgatherdd m17{k4}, [scalingq+m16] + vpshufb m3{k3}, m17, m7 + vpsllvw m2, m10 + vpsllvw m3, m10 + pmulhrsw m4, m2 + pmulhrsw m5, m3 + add grain_lutq, 82*4 + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m8 + pmaxsw m1, m8 + pminsw m0, m9 + pminsw m1, m9 + mova [dstq+srcq], m0 + add srcq, strideq + mova [dstq+srcq], m1 + add srcq, strideq + ret + +%macro FGUV_FN 3 ; name, ss_hor, ss_ver +cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 22, dst, src, stride, fg_data, w, scaling, \ + grain_lut, h, sby, luma, lstride, uv_pl, is_id +%define base r12-fg_min + lea r12, [fg_min] + mov r9d, r13m ; bdmax + mov r7d, [fg_dataq+FGData.scaling_shift] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] + mov r11d, is_idm + kxnorw k1, k1, k1 ; 0xffff + vpbroadcastd m5, r13m + mov r13, 0xeeeeeeeeeeeeeeee + vbroadcasti32x4 m6, [base+scale_mask] + shr r9d, 11 ; is_12bpc + vpbroadcastd m7, [base+scale_shift+r7*4-32] + shlx r10d, r6d, r9d + mov sbyd, sbym + shlx r6d, r6d, r11d + vpbroadcastd m8, [base+fg_min+r10*4] + lea r6d, [r9+r6*2] + vpbroadcastd m9, [base+fg_max+r6*4] + kmovq k2, r13 + vpbroadcastd m20, [base+scale_rnd+r9*4] + packssdw m4, m5, m5 + vpbroadcastd m21, [base+scale_shift+r9*8+4] +%if %2 + mova m12, [base+pb_0to63] ; pw_even + mov r13d, 0x0101 + vpbroadcastq m10, [base+pw_23_22+r9*8] + kmovw k3, r13d +%if %3 + pshufd m11, m10, q0000 +%else + vpbroadcastd ym16, [base+pw_27_17_17_27+r9*8+0] + vpbroadcastd m11, [base+pw_27_17_17_27+r9*8+4] + vmovdqu16 m11{k1}, m16 +%endif + psrlw m13, m12, 8 ; pw_odd +%else + vpbroadcastq m10, [base+pw_27_17_17_27+r9*8] + kshiftrb k3, k1, 7 ; 0x01 + kshiftrb k4, k1, 4 ; 0x0f + pshufd m11, m10, q0000 +%endif + mov lstrideq, r10mp + test sbyd, sbyd + setnz r7b + cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 + jne .csfl + +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + _, sby, see, lstride + +%if %1 + mov r6d, r11m + vpbroadcastd m0, [base+uv_offset_mul+r9*4] + vpbroadcastd m1, [base+pb_8_9_0_1] + vpbroadcastd m14, [fg_dataq+FGData.uv_offset+r6*4] + vbroadcasti32x4 m15, [fg_dataq+FGData.uv_mult+r6*4] + pmaddwd m14, m0 + pshufb m15, m1 ; { uv_luma_mult, uv_mult } +%endif + test r7b, [fg_dataq+FGData.overlap_flag] + jnz %%v_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rorx seed, 
seed, 24 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, lstride, luma + + mov lumaq, r9mp + lea r12, [srcq+wq*2] + lea r13, [dstq+wq*2] + lea r14, [lumaq+wq*(2<<%2)] + mov r9mp, r12 + mov r10mp, r13 + mov r11mp, r14 + neg wq + +%%loop_x: + rorx r6, seeq, 1 + or seed, 0xeff4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, lstride, luma + + mov grain_lutq, grain_lutmp + mov hd, hm +%%loop_y: +%if %2 + movu ym18, [grain_lutq+offxyq*2+82*0] + vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1 + movu ym19, [grain_lutq+offxyq*2+82*4] + vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1 +%else + movu m18, [grain_lutq+offxyq*2+82*0] + movu m19, [grain_lutq+offxyq*2+82*2] +%endif + call %%add_noise + sub hb, 2<<%2 + jg %%loop_y + add wq, 32>>%2 + jge .end + mov srcq, r9mp + mov dstq, r10mp + mov lumaq, r11mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + cmp byte [fg_dataq+FGData.overlap_flag], 0 + je %%loop_x + cmp dword r8m, 0 ; sby + jne %%hv_overlap + + ; horizontal overlap (without vertical overlap) +%%loop_x_h_overlap: + rorx r6, seeq, 1 + or seed, 0xEFF4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, lstride, luma, left_offxy + + lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, lstride, luma, left_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm +%%loop_y_h_overlap: +%if %2 + movu ym18, [grain_lutq+offxyq*2+82*0] + vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1 + movu ym19, [grain_lutq+offxyq*2+82*4] + vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1 + movd xm16, [grain_lutq+left_offxyq*2+82*0] + vinserti32x4 m16, [grain_lutq+left_offxyq*2+82*2], 2 + movd xm17, [grain_lutq+left_offxyq*2+82*4] + vinserti32x4 m17, [grain_lutq+left_offxyq*2+82*6], 2 + punpckldq m16, m17 + punpckldq m17, m18, m19 + punpcklwd m16, m17 + mova m17, m20 + vpdpwssd m17, m16, m10 + psrad m17, 1 + packssdw m17, m17 + vpsravw m17, m21 +%else + movu m18, [grain_lutq+offxyq*2+82*0] + movu m19, [grain_lutq+offxyq*2+82*2] + movd xm16, [grain_lutq+left_offxyq*2+82*0] + pinsrd xm16, [grain_lutq+left_offxyq*2+82*2], 1 + punpckldq xm17, xm18, xm19 + punpcklwd xm16, xm17 + mova xm17, xm20 + vpdpwssd xm17, xm16, xm10 + psrad xm17, 1 + packssdw xm17, xm17 + vpsravw xm17, xm21 +%endif + vmovdqa32 m18{k3}, m17 + vpshufd m19{k3}, m17, q0321 + call %%add_noise + sub hb, 2<<%2 + jg %%loop_y_h_overlap + add wq, 32>>%2 + jge .end + mov srcq, r9mp + mov dstq, r10mp + mov lumaq, r11mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + cmp dword r8m, 0 ; sby + jne %%hv_overlap + jmp %%loop_x_h_overlap + +%%v_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + _, sby, see, lstride + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add 
r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, lstride, luma, _, top_offxy + + mov lumaq, r9mp + lea r12, [srcq+wq*2] + lea r13, [dstq+wq*2] + lea r14, [lumaq+wq*(2<<%2)] + mov r9mp, r12 + mov r10mp, r13 + mov r11mp, r14 + neg wq + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, lstride, luma, _, top_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 + +%if %3 + movu ym16, [grain_lutq+offxyq*2+82*0] + movu ym1, [grain_lutq+top_offxyq*2+82*0] + vbroadcasti32x8 m18, [grain_lutq+offxyq*2+82*2] + movu ym19, [grain_lutq+offxyq*2+82*4] + vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1 + punpcklwd ym17, ym1, ym16 + punpckhwd ym1, ym16 +%elif %2 + movu ym18, [grain_lutq+offxyq*2+82*0] + vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1 + movu ym17, [grain_lutq+top_offxyq*2+82*0] + vinserti32x8 m17, [grain_lutq+top_offxyq*2+82*2], 1 + movu ym19, [grain_lutq+offxyq*2+82*4] + vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1 + punpcklwd m16, m17, m18 + punpckhwd m17, m18 +%else + movu m18, [grain_lutq+offxyq*2+82*0] + movu m19, [grain_lutq+top_offxyq*2+82*0] + movu m2, [grain_lutq+offxyq*2+82*2] + movu m16, [grain_lutq+top_offxyq*2+82*2] + punpckhwd m1, m19, m18 + punpcklwd m19, m18 + punpckhwd m18, m2, m16 + punpcklwd m2, m16 +%endif + call %%add_noise_v + sub hb, 2<<%2 + jg %%loop_y + add wq, 32>>%2 + jge .end + mov srcq, r9mp + mov dstq, r10mp + mov lumaq, r11mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + + ; since fg_dataq.overlap is guaranteed to be set, we never jump back + ; to %%v_overlap, and instead always fall-through to %%hv_overlap +%%hv_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy + + lea topleft_offxyq, [top_offxyq+(32>>%2)] + lea left_offxyq, [offyq+(32>>%2)] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 + + ; grain = grain_lut[offy+y][offx+x] +%if %2 + 
movd xm16, [grain_lutq+left_offxyq*2+82*0] + vinserti32x4 m16, [grain_lutq+left_offxyq*2+82*2], 2 + movd xm17, [grain_lutq+left_offxyq*2+82*4] + vinserti32x4 m17, [grain_lutq+left_offxyq*2+82*6], 2 + movu ym18, [grain_lutq+offxyq*2+82*0] + vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1 + movu ym19, [grain_lutq+offxyq*2+82*4] + vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1 + punpckldq m16, m17 + punpckldq m17, m18, m19 + punpcklwd m16, m17 + movu ym1, [grain_lutq+top_offxyq*2+82*0] + movd xm17, [grain_lutq+topleft_offxyq*2+82*0] + mova m0, m20 + vpdpwssd m0, m16, m10 +%if %3 + punpcklwd xm17, xm1 + mova xm16, xm20 + vpdpwssd xm16, xm17, xm10 + psrad xm16, 1 +%else + vinserti32x8 m1, [grain_lutq+top_offxyq*2+82*2], 1 + vinserti32x4 m17, [grain_lutq+topleft_offxyq*2+82*2], 2 + punpcklwd m17, m1 + mova m16, m20 + vpdpwssd m16, m17, m10 + psrad m16, 1 +%endif + psrad m0, 1 + packssdw m0, m16 + vpsravw m0, m21 + vmovdqa32 m18{k3}, m0 + vpshufd m19{k3}, m0, q0321 +%if %3 + vpunpckhdq ym1{k3}, ym0, ym0 + punpcklwd ym17, ym1, ym18 + punpckhwd ym1, ym18 +%else + vpunpckhdq m1{k3}, m0, m0 + punpcklwd m16, m1, m18 + punpckhwd m17, m1, m18 +%endif +%else + movu m18, [grain_lutq+offxyq*2+82*0] + movu m19, [grain_lutq+top_offxyq*2+82*0] + movd xm17, [grain_lutq+left_offxyq*2+82*0] + pinsrd xm17, [grain_lutq+topleft_offxyq*2+82*0], 1 + punpckldq xm16, xm18, xm19 + punpcklwd xm17, xm16 + movu m2, [grain_lutq+offxyq*2+82*2] + movu m0, [grain_lutq+top_offxyq*2+82*2] + movd xm16, [grain_lutq+left_offxyq*2+82*2] + pinsrd xm16, [grain_lutq+topleft_offxyq*2+82*2], 1 + punpckldq xm1, xm2, xm0 + punpcklwd xm1, xm16, xm1 + mova xm16, xm20 + vpdpwssd xm16, xm17, xm10 + mova xm17, xm20 + vpdpwssd xm17, xm1, xm10 + punpckhwd m1, m19, m18 + punpcklwd m19, m18 + punpckhwd m18, m2, m0 + punpcklwd m2, m0 + psrad xm16, 1 + psrad xm17, 1 + packssdw xm16, xm17 + vpsravw xm16, xm21 + vpshuflw m19{k4}, m16, q1302 + punpckhqdq xm16, xm16 + vpshuflw m2{k4}, m16, q3120 +%endif + call %%add_noise_v + sub hb, 2<<%2 + jg %%loop_y_h_overlap + add wq, 32>>%2 + jge .end + mov srcq, r9mp + mov dstq, r10mp + mov lumaq, r11mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + jmp %%hv_overlap + +ALIGN function_align +%%add_noise_v: +%if %3 + mova ym16, ym20 + vpdpwssd ym16, ym17, ym11 + mova ym17, ym20 + vpdpwssd ym17, ym1, ym11 + psrad ym16, 1 + psrad ym17, 1 + packssdw ym16, ym17 + vpsravw m18{k1}, m16, m21 +%elif %2 + mova m18, m20 + vpdpwssd m18, m16, m11 + mova m16, m20 + vpdpwssd m16, m17, m11 + psrad m18, 1 + psrad m16, 1 + packssdw m18, m16 + vpsravw m18, m21 +%else + mova m16, m20 + vpdpwssd m16, m1, m11 + mova m17, m20 + vpdpwssd m17, m18, m11 + mova m18, m20 + vpdpwssd m18, m19, m11 + mova m19, m20 + vpdpwssd m19, m2, m11 + REPX {psrad x, 1}, m16, m17, m18, m19 + packssdw m18, m16 + packssdw m19, m17 + vpsravw m18, m21 + vpsravw m19, m21 +%endif +%%add_noise: +%if %2 + mova m2, [lumaq+lstrideq*(0<<%3)] + mova m0, [lumaq+lstrideq*(1<<%3)] + lea lumaq, [lumaq+lstrideq*(2<<%3)] + mova m3, [lumaq+lstrideq*(0<<%3)] + mova m1, [lumaq+lstrideq*(1<<%3)] + mova m16, m12 + vpermi2w m16, m2, m0 + vpermt2w m2, m13, m0 + mova m17, m12 + vpermi2w m17, m3, m1 + vpermt2w m3, m13, m1 + pavgw m2, m16 + pavgw m3, m17 +%elif %1 + mova m2, [lumaq+lstrideq*0] + mova m3, [lumaq+lstrideq*1] +%endif +%if %2 + mova ym16, [srcq+strideq*0] + vinserti32x8 m16, [srcq+strideq*1], 1 + lea srcq, [srcq+strideq*2] +%else + mova m16, [srcq+strideq*0] +%endif +%if %1 + punpckhwd m17, m2, m16 + mova m0, m14 + vpdpwssd m0, m17, m15 
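+ ; not-csfl: the punpck{l,h}wd/vpdpwssd pairs here blend luma and chroma
+ ; before the scaling-LUT lookup; per pixel, roughly (uv_offset pre-scaled
+ ; and folded into m14 via uv_offset_mul):
+ ;   val = clip_pixel((luma*uv_luma_mult + chroma*uv_mult + offset) >> 6)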
+ punpcklwd m17, m2, m16 + mova m2, m14 + vpdpwssd m2, m17, m15 +%endif +%if %2 + mova ym17, [srcq+strideq*0] + vinserti32x8 m17, [srcq+strideq*1], 1 +%else + mova m17, [srcq+strideq*1] +%endif +%if %1 + psrad m0, 6 + psrad m2, 6 + packusdw m2, m0 + punpckhwd m0, m3, m17 + mova m1, m14 + vpdpwssd m1, m15, m0 + punpcklwd m0, m3, m17 + mova m3, m14 + vpdpwssd m3, m15, m0 + psrad m1, 6 + psrad m3, 6 + packusdw m3, m1 + pminuw m2, m4 + pminuw m3, m4 + +.add_noise_main: + ; scaling[luma_src] + kmovw k5, k1 + pand m1, m5, m2 + vpgatherdd m0{k5}, [scalingq+m1] + kmovw k5, k1 + psrld m2, 16 + vpgatherdd m1{k5}, [scalingq+m2] + vpshufb m0{k2}, m1, m6 + kmovw k5, k1 + psrld m1, m3, 16 + vpgatherdd m2{k5}, [scalingq+m1] + kmovw k5, k1 + pand m3, m5 + vpgatherdd m1{k5}, [scalingq+m3] + vpshufb m1{k2}, m2, m6 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + vpsllvw m0, m7 + vpsllvw m1, m7 + pmulhrsw m18, m0 + pmulhrsw m19, m1 + add grain_lutq, 82*(4<<%2) + lea lumaq, [lumaq+lstrideq*(2<<%3)] + lea srcq, [srcq+strideq*2] + paddw m16, m18 + paddw m17, m19 + pmaxsw m16, m8 + pmaxsw m17, m8 + pminsw m16, m9 + pminsw m17, m9 +%if %2 + mova [dstq+strideq*0], ym16 + vextracti32x8 [dstq+strideq*1], m16, 1 + lea dstq, [dstq+strideq*2] + mova [dstq+strideq*0], ym17 + vextracti32x8 [dstq+strideq*1], m17, 1 +%else + mova [dstq+strideq*0], m16 + mova [dstq+strideq*1], m17 +%endif + lea dstq, [dstq+strideq*2] + ret +%else +%if %2 + pand m2, m4 + pand m3, m4 +%else + pand m2, m4, [lumaq+lstrideq*0] + pand m3, m4, [lumaq+lstrideq*1] +%endif + jmp .add_noise_main +%endif +%endmacro + + %%FGUV_32x32xN_LOOP 1, %2, %3 +.csfl: + %%FGUV_32x32xN_LOOP 0, %2, %3 +.end: + RET +%endmacro + +FGUV_FN 420, 1, 1 +FGUV_FN 422, 1, 0 +FGUV_FN 444, 0, 0 + +%endif diff --git a/third_party/dav1d/src/x86/filmgrain16_sse.asm b/third_party/dav1d/src/x86/filmgrain16_sse.asm new file mode 100644 index 0000000000..6b0daaac0b --- /dev/null +++ b/third_party/dav1d/src/x86/filmgrain16_sse.asm @@ -0,0 +1,3421 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
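+
+; 16bpc film grain, SSSE3 variants for both x86-32 and x86-64. Grain
+; generation below implements the AV1 grain LFSR; as a C sketch (taps at
+; bits 0, 1, 3 and 12, cf. rnd_next_upperbit_mask):
+;   bit = (s ^ (s >> 1) ^ (s >> 3) ^ (s >> 12)) & 1;
+;   s = (s >> 1) | (bit << 15);
+;   grain = gaussian_sequence[s >> 5]; /* then grain_scale_shift is applied */
+; the SIMD code steps four seeds at once, aggregating the feedback bits in
+; parallel (see pb_mask/mul_bits).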
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" +%include "x86/filmgrain_common.asm" + +SECTION_RODATA 16 +pd_16: times 4 dd 16 +pw_1: times 8 dw 1 +pw_16384: times 8 dw 16384 +pw_8192: times 8 dw 8192 +pw_23_22: dw 23, 22 + times 3 dw 0, 32 +pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 +pw_27_17_17_27: dw 27, 17, 17, 27 + times 2 dw 0, 32 +rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 +pw_seed_xor: times 2 dw 0xb524 + times 2 dw 0x49d8 +pb_1: times 4 db 1 +hmul_bits: dw 32768, 16384, 8192, 4096 +round: dw 2048, 1024, 512 +mul_bits: dw 256, 128, 64, 32, 16 +round_vals: dw 32, 64, 128, 256, 512, 1024 +max: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16 +min: dw 0, 16*4, 16*16 +; these two should be next to each other +pw_4: times 2 dw 4 +pw_16: times 2 dw 16 + +%macro JMP_TABLE 1-* + %xdefine %1_table %%table + %xdefine %%base %1_table + %xdefine %%prefix mangle(private_prefix %+ _%1) + %%table: + %rep %0 - 1 + dd %%prefix %+ .ar%2 - %%base + %rotate 1 + %endrep +%endmacro + +JMP_TABLE generate_grain_y_16bpc_ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_420_16bpc_ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_422_16bpc_ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_444_16bpc_ssse3, 0, 1, 2, 3 + +SECTION .text + +%if ARCH_X86_32 +%undef base +%define PIC_ptr(a) base+a +%else +%define PIC_ptr(a) a +%endif + +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +%macro vpgatherdw 5-8 8, 1 ; dst, src, base, tmp_gpr[x2], cnt, stride, tmp_xmm_reg +%assign %%idx 0 +%define %%tmp %2 +%if %0 == 8 +%define %%tmp %8 +%endif +%rep (%6/2) +%if %%idx == 0 + movd %5 %+ d, %2 + pshuflw %%tmp, %2, q3232 +%else + movd %5 %+ d, %%tmp +%if %6 == 8 +%if %%idx == 2 + punpckhqdq %%tmp, %%tmp +%elif %%idx == 4 + psrlq %%tmp, 32 +%endif +%endif +%endif + movzx %4 %+ d, %5 %+ w + shr %5 %+ d, 16 + +%if %%idx == 0 + movd %1, [%3+%4*%7] +%else + pinsrw %1, [%3+%4*%7], %%idx + 0 +%endif + pinsrw %1, [%3+%5*%7], %%idx + 1 +%assign %%idx %%idx+2 +%endrep +%endmacro + +%macro SPLATD 2 ; dst, src +%ifnidn %1, %2 + movd %1, %2 +%endif + pshufd %1, %1, q0000 +%endmacro + +%macro SPLATW 2 ; dst, src +%ifnidn %1, %2 + movd %1, %2 +%endif + pshuflw %1, %1, q0000 + punpcklqdq %1, %1 +%endmacro + + +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal generate_grain_y_16bpc, 3, 8, 16, buf, fg_data, bdmax + lea r4, [pb_mask] +%define base r4-pb_mask +%else +cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax + LEA r4, $$ +%define base r4-$$ +%endif + movq m1, [base+rnd_next_upperbit_mask] + movq m4, [base+mul_bits] + movq m7, [base+hmul_bits] + mov r3d, [fg_dataq+FGData.grain_scale_shift] + lea r5d, [bdmaxq+1] + shr r5d, 11 ; 0 for 10bpc, 2 for 12bpc + sub r3, r5 + SPLATW m6, [base+round+r3*2-2] + mova m5, [base+pb_mask] + SPLATW m0, [fg_dataq+FGData.seed] + mov r3, -73*82*2 + sub bufq, r3 +%if ARCH_X86_64 + lea r6, [gaussian_sequence] +%endif +.loop: + pand m2, m0, m1 + psrlw m3, m2, 10 + por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw m2, m4 ; bits 0x0f00 are set + pshufb m3, m5, m2 ; set 15th bit for next 4 seeds + psllq m2, m3, 30 + por m2, m3 + psllq m3, m2, 15 + por m2, m3 ; aggregate each bit into next seed's high bit + pmulhuw m3, m0, m7 + por m2, m3 ; 4 next output seeds + pshuflw m0, m2, q3333 + psrlw m2, 5 +%if ARCH_X86_64 + vpgatherdw m3, m2, r6, r5, r7, 4, 2 +%else + vpgatherdw m3, m2, base+gaussian_sequence, r5, r2, 4, 2 +%endif + paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0 + ; shifts by 0, which pmulhrsw does not support + 
pmulhrsw m3, m6 + movq [bufq+r3], m3 + add r3, 4*2 + jl .loop + + ; auto-regression code + movsxd r3, [fg_dataq+FGData.ar_coeff_lag] + movsxd r3, [base+generate_grain_y_16bpc_ssse3_table+r3*4] + lea r3, [r3+base+generate_grain_y_16bpc_ssse3_table] + jmp r3 + +.ar1: +%if WIN64 + DEFINE_ARGS shift, fg_data, max, buf, val3, min, cf3, x, val0 + lea bufq, [r0-2*(82*73-(82*3+79))] + PUSH r8 +%else +%if ARCH_X86_64 + DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0 +%else ; x86-32 + DEFINE_ARGS buf, fg_data, min, val3, x, cf3, val0 + PUSH r6 +%define shiftd r1d +%endif + sub bufq, 2*(82*73-(82*3+79)) +%endif + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] + movd m4, [fg_dataq+FGData.ar_coeffs_y] + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] +%if WIN64 + DEFINE_ARGS shift, h, max, buf, val3, min, cf3, x, val0 +%elif ARCH_X86_64 + DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0 +%else ; x86-32 +%undef shiftd + DEFINE_ARGS buf, shift, min, val3, x, cf3, val0 +%define hd dword r0m +%define maxd dword minm +%endif +%if cpuflag(sse4) + pmovsxbw m4, m4 +%else + pxor m3, m3 + pcmpgtb m3, m4 + punpcklbw m4, m3 +%endif + pinsrw m4, [base+pw_1], 3 + pshufd m5, m4, q1111 + pshufd m4, m4, q0000 + SPLATW m3, [base+round_vals+shiftq*2-12] ; rnd + mov hd, 70 + sar maxd, 1 + mov mind, maxd + xor mind, -1 +.y_loop_ar1: + mov xq, -76 + movsx val3d, word [bufq+xq*2-2] +.x_loop_ar1: + movu m0, [bufq+xq*2-82*2-2] ; top/left + psrldq m2, m0, 2 ; top + psrldq m1, m0, 4 ; top/right + punpcklwd m0, m2 + punpcklwd m1, m3 + pmaddwd m0, m4 + pmaddwd m1, m5 + paddd m0, m1 +.x_loop_ar1_inner: + movd val0d, m0 + psrldq m0, 4 + imul val3d, cf3d + add val3d, val0d + sar val3d, shiftb + movsx val0d, word [bufq+xq*2] + add val3d, val0d + cmp val3d, maxd + cmovg val3d, maxd + cmp val3d, mind + cmovl val3d, mind + mov word [bufq+xq*2], val3w + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar1 +%if WIN64 + POP r8 +%elif ARCH_X86_32 + POP r6 +%undef maxd +%undef hd +%endif +.ar0: + RET + +.ar2: +%if ARCH_X86_32 +%assign stack_offset_old stack_offset + ALLOC_STACK -16*8 +%endif + DEFINE_ARGS buf, fg_data, bdmax, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m0, [base+round_vals-12+shiftq*2] + pshuflw m0, m0, q0000 + movu m6, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-11 + pxor m2, m2 + punpcklwd m0, m2 + pcmpgtb m2, m6 + punpckhbw m3, m6, m2 + punpcklbw m6, m2 + pshufd m2, m6, q3333 + pshufd m1, m6, q2222 + pshufd m7, m6, q1111 + pshufd m6, m6, q0000 + pshufd m4, m3, q1111 + pshufd m3, m3, q0000 +%if ARCH_X86_64 + SWAP 0, 12 + SWAP 1, 8 + SWAP 2, 9 + SWAP 3, 10 + SWAP 4, 11 +%else +%define m12 [rsp+0*16] +%define m8 [rsp+1*16] +%define m9 [rsp+2*16] +%define m10 [rsp+3*16] +%define m11 [rsp+4*16] + mova m12, m0 + mova m8, m1 + mova m9, m2 + mova m10, m3 + mova m11, m4 + mov bdmaxd, bdmaxm +%endif + sar bdmaxd, 1 + SPLATW m0, bdmaxd ; max_grain + pcmpeqw m1, m1 +%if !cpuflag(sse4) + pcmpeqw m2, m2 + psrldq m2, 14 + pslldq m2, 2 + pxor m2, m1 +%endif + pxor m1, m0 ; min_grain +%if ARCH_X86_64 + SWAP 0, 13 + SWAP 1, 14 + SWAP 2, 15 +%else +%define m13 [rsp+5*16] +%define m14 [rsp+6*16] + mova m13, m0 + mova m14, m1 +%if !cpuflag(sse4) +%define m15 [rsp+7*16] + mova m15, m2 +%endif +%endif + sub bufq, 2*(82*73-(82*3+79)) + DEFINE_ARGS buf, fg_data, h, x + mov hd, 70 +.y_loop_ar2: + mov xq, -76 + +.x_loop_ar2: + movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5] + 
movu m1, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] + psrldq m2, m0, 2 + psrldq m3, m0, 4 + psrldq m4, m0, 6 + psrldq m5, m0, 8 + punpcklwd m0, m2 + punpcklwd m3, m4 + punpcklwd m5, m1 + psrldq m2, m1, 2 + psrldq m4, m1, 4 + punpcklwd m2, m4 + psrldq m4, m1, 6 + psrldq m1, 8 + punpcklwd m4, m1 + pmaddwd m0, m6 + pmaddwd m3, m7 + pmaddwd m5, m8 + pmaddwd m2, m9 + pmaddwd m4, m10 + paddd m0, m3 + paddd m5, m2 + paddd m0, m4 + paddd m0, m5 ; accumulated top 2 rows + paddd m0, m12 + + movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5] + pshufd m4, m1, q3321 + pxor m2, m2 + pcmpgtw m2, m4 + punpcklwd m4, m2 ; in dwords, y=0,x=[0,3] +.x_loop_ar2_inner: + pmaddwd m2, m1, m11 + paddd m2, m0 + psrldq m0, 4 ; shift top to next pixel + psrad m2, [fg_dataq+FGData.ar_coeff_shift] + paddd m2, m4 + packssdw m2, m2 + pminsw m2, m13 + pmaxsw m2, m14 + psrldq m4, 4 + pslldq m2, 2 + psrldq m1, 2 +%if cpuflag(sse4) + pblendw m1, m2, 00000010b +%else + pand m1, m15 + pandn m3, m15, m2 + por m1, m3 +%endif + ; overwrite previous pixel, this should be ok + movd [bufq+xq*2-2], m1 + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar2 +%if ARCH_X86_32 +%undef m8 +%undef m9 +%undef m10 +%undef m11 +%undef m12 +%undef m13 +%undef m14 +%undef m15 +%endif + RET + +.ar3: + DEFINE_ARGS buf, fg_data, bdmax, shift +%if WIN64 + mov r6, rsp + and rsp, ~15 + sub rsp, 64 + %define tmp rsp +%elif ARCH_X86_64 + %define tmp rsp+stack_offset-72 +%else +%assign stack_offset stack_offset_old + ALLOC_STACK -16*12 + %define tmp rsp + mov bdmaxd, bdmaxm +%endif + sar bdmaxd, 1 + SPLATW m7, bdmaxd ; max_grain + pcmpeqw m6, m6 +%if !cpuflag(sse4) + pcmpeqw m4, m4 + psrldq m4, 14 + pslldq m4, 4 + pxor m4, m6 +%endif + pxor m6, m7 ; min_grain + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + +%if ARCH_X86_64 + SWAP 6, 14 + SWAP 7, 15 +%else +%define m14 [rsp+10*16] +%define m15 [esp+11*16] + mova m14, m6 + mova m15, m7 +%endif + + ; build cf0-1 until 18-19 in m5-12 and r0/1 + pxor m1, m1 + movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15 + pcmpgtb m1, m0 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + +%if cpuflag(sse4) + pshufd m4, m2, q3333 +%else + pshufd m5, m2, q3333 + mova [tmp+48], m5 +%endif + pshufd m3, m2, q2222 + pshufd m1, m2, q0000 + pshufd m2, m2, q1111 + pshufd m7, m0, q2222 + pshufd m6, m0, q1111 + pshufd m5, m0, q0000 + pshufd m0, m0, q3333 + +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 + SWAP 2, 10 + SWAP 3, 11 + SWAP 4, 12 +%else +%define m8 [rsp+4*16] +%define m9 [esp+5*16] +%define m10 [rsp+6*16] +%define m11 [esp+7*16] +%define m12 [rsp+8*16] + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 + mova m12, m4 +%endif + + ; build cf20,round in r2 + ; build cf21-23,round*2 in m13 + pxor m1, m1 + movq m0, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 + pcmpgtb m1, m0 + punpcklbw m0, m1 + pshufd m1, m0, q0000 + pshufd m2, m0, q1111 + mova [tmp+ 0], m1 + mova [tmp+16], m2 + psrldq m3, m0, 10 + pinsrw m3, [base+round_vals+shiftq*2-10], 3 + +%if ARCH_X86_64 + SWAP 3, 13 +%else +%define m13 [esp+9*16] + mova m13, m3 +%endif + + pinsrw m0, [base+round_vals+shiftq*2-12], 5 + pshufd m3, m0, q2222 + mova [tmp+32], m3 + + DEFINE_ARGS buf, fg_data, h, x + sub bufq, 2*(82*73-(82*3+79)) + mov hd, 70 +.y_loop_ar3: + mov xq, -76 + +.x_loop_ar3: + movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] + movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6] + palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5] + palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6] + punpckhwd m3, m0, m2 ; 
y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5] + punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1] + shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3] + + pmaddwd m0, m5 + pmaddwd m2, m6 + pmaddwd m3, m7 + paddd m0, m2 + paddd m0, m3 + ; m0 = top line first 6 multiplied by cf, m1 = top line last entry + + movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4] + movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6] + punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0] + palignr m4, m3, m2, 2 ; y=-3,x=[-2,+5] + palignr m3, m3, m2, 4 ; y=-3,x=[-1,+6] + punpckhwd m2, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6] + punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] + shufps m3, m4, m2, q1032 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] + + pmaddwd m1, m8 + pmaddwd m4, m9 + pmaddwd m3, m10 + pmaddwd m2, m11 + paddd m1, m4 + paddd m3, m2 + paddd m0, m1 + paddd m0, m3 + ; m0 = top 2 lines multiplied by cf + + movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] + movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6] + palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5] + palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6] + punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5] + punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] + shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3] + punpcklwd m2, [base+pw_1] + +%if cpuflag(sse4) + pmaddwd m1, m12 +%else + pmaddwd m1, [tmp+48] +%endif + pmaddwd m3, [tmp+ 0] + pmaddwd m4, [tmp+16] + pmaddwd m2, [tmp+32] + paddd m1, m3 + paddd m4, m2 + paddd m0, m1 + paddd m0, m4 + ; m0 = top 3 lines multiplied by cf plus rounding for downshift + + movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4] +.x_loop_ar3_inner: + pmaddwd m2, m1, m13 + pshufd m3, m2, q1111 + paddd m2, m3 ; left+cur + paddd m2, m0 ; add top + psrldq m0, 4 + psrad m2, [fg_dataq+FGData.ar_coeff_shift] + packssdw m2, m2 + pminsw m2, m15 + pmaxsw m2, m14 + pslldq m2, 4 + psrldq m1, 2 +%if cpuflag(sse4) + pblendw m1, m2, 00000100b +%else + pand m1, m12 + pandn m3, m12, m2 + por m1, m3 +%endif + ; overwrite a couple of pixels, should be ok + movq [bufq+xq*2-4], m1 + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82*2 + dec hd + jg .y_loop_ar3 +%if WIN64 + mov rsp, r6 +%elif ARCH_X86_32 +%undef m8 +%undef m9 +%undef m10 +%undef m11 +%undef m12 +%undef m13 +%undef m14 +%undef m15 +%endif + RET + +%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal generate_grain_uv_%1_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax, x, gaussian_reg, h, pic_reg +%define base r8-pb_mask + lea r8, [pb_mask] + movifnidn bdmaxd, bdmaxm + lea r6d, [bdmaxq+1] +%else +cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h +%define base r2-$$ + LEA r2, $$ + mov fg_dataq, r2m + mov r6d, r4m + inc r6d +%endif + movq m1, [base+rnd_next_upperbit_mask] + movq m4, [base+mul_bits] + movq m7, [base+hmul_bits] + mov r5d, [fg_dataq+FGData.grain_scale_shift] + shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc + sub r5, r6 + SPLATW m6, [base+round+r5*2-2] + mova m5, [base+pb_mask] + SPLATW m0, [fg_dataq+FGData.seed] +%if ARCH_X86_64 + SPLATW m2, [base+pw_seed_xor+uvq*4] +%else + mov r5d, r3m + SPLATW m2, [base+pw_seed_xor+r5*4] +%endif + pxor m0, m2 +%if ARCH_X86_64 + lea r6, [gaussian_sequence] +%endif +%if %2 + mov hd, 73-35*%3 + add bufq, 44*2 +.loop_y: + mov xq, -44 +%else + mov xq, -82*73 + add bufq, 82*73*2 +%endif +.loop_x: + pand m2, m0, m1 + psrlw m3, m2, 10 + por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw m2, m4 ; bits 0x0f00 are set + pshufb m3, m5, m2 ; set 
15th bit for next 4 seeds + psllq m2, m3, 30 + por m2, m3 + psllq m3, m2, 15 + por m2, m3 ; aggregate each bit into next seed's high bit + pmulhuw m3, m0, m7 + por m2, m3 ; 4 next output seeds + pshuflw m0, m2, q3333 + psrlw m2, 5 +%if ARCH_X86_64 + vpgatherdw m3, m2, r6, r9, r10, 4, 2 +%else + vpgatherdw m3, m2, base+gaussian_sequence, r5, r6, 4, 2 +%endif + paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0 + ; shifts by 0, which pmulhrsw does not support + pmulhrsw m3, m6 + movq [bufq+xq*2], m3 + add xq, 4 + jl .loop_x +%if %2 + add bufq, 82*2 + dec hd + jg .loop_y +%endif + + ; auto-regression code + movsxd r5, [fg_dataq+FGData.ar_coeff_lag] + movsxd r5, [base+generate_grain_uv_%1_16bpc_ssse3_table+r5*4] + lea r5, [r5+base+generate_grain_uv_%1_16bpc_ssse3_table] + jmp r5 + +.ar0: +%if ARCH_X86_64 + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift +%else + DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift +%assign stack_offset_old stack_offset + ALLOC_STACK -16*2 + mov bufyq, r1m + mov uvd, r3m +%endif + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq] + SPLATW m3, [base+hmul_bits+shiftq*2-10] +%if ARCH_X86_64 + sar bdmaxd, 1 + SPLATW m1, bdmaxd ; max_gain +%else + SPLATW m1, r4m + psraw m1, 1 +%endif + pcmpeqw m7, m7 + pxor m7, m1 ; min_grain +%if ARCH_X86_64 + SWAP 1, 14 + DEFINE_ARGS buf, bufy, h, x +%else +%define m14 [rsp+0*16] + mova m14, m1 + DEFINE_ARGS buf, bufy, pic_reg, h, x +%endif + pxor m5, m5 + pcmpgtb m5, m4 + punpcklbw m4, m5 +%if %2 + SPLATW m6, [base+hmul_bits+2+%3*2] +%endif + SPLATW m4, m4 + pxor m5, m5 +%if %2 +%if !cpuflag(sse4) + pcmpeqw m2, m2 + pslldq m2, 12 +%if ARCH_X86_64 + SWAP 2, 12 +%else +%define m12 [rsp+1*16] + mova m12, m2 +%endif +%endif +%endif +%if %2 + sub bufq, 2*(82*(73-35*%3)+82-(82*3+41)) +%else + sub bufq, 2*(82*70-3) +%endif + add bufyq, 2*(3+82*3) + mov hd, 70-35*%3 +.y_loop_ar0: + ; first 32 pixels + xor xd, xd +.x_loop_ar0: + movu m0, [bufyq+xq*(2<<%2)] +%if %2 +%if %3 + movu m2, [bufyq+xq*4+82*2] + paddw m0, m2 +%endif + movu m1, [bufyq+xq*4 +16] +%if %3 + movu m2, [bufyq+xq*4+82*2+16] + paddw m1, m2 +%endif + phaddw m0, m1 + pmulhrsw m0, m6 +%endif + punpckhwd m1, m0, m5 + punpcklwd m0, m5 + REPX {pmaddwd x, m4}, m0, m1 + REPX {psrad x, 5}, m0, m1 + packssdw m0, m1 + pmulhrsw m0, m3 + movu m1, [bufq+xq*2] + paddw m0, m1 + pminsw m0, m14 + pmaxsw m0, m7 + cmp xd, 72-40*%2 + je .end + movu [bufq+xq*2], m0 + add xd, 8 + jmp .x_loop_ar0 + + ; last 6/4 pixels +.end: +%if %2 +%if cpuflag(sse4) + pblendw m0, m1, 11000000b +%else + pand m1, m12 + pandn m2, m12, m0 + por m0, m1, m2 +%endif + movu [bufq+xq*2], m0 +%else + movq [bufq+xq*2], m0 +%endif + + add bufq, 82*2 + add bufyq, 82*(2<<%3) + dec hd + jg .y_loop_ar0 +%if ARCH_X86_32 +%undef m12 +%undef m14 +%endif + RET + +.ar1: +%if ARCH_X86_64 + DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x +%else +%assign stack_offset stack_offset_old +%xdefine rstk rsp +%assign stack_size_padded 0 + DEFINE_ARGS buf, shift, pic_reg, fg_data, uv, bufy, cf3 + mov bufyq, r1m + mov uvd, r3m +%endif + imul uvd, 28 + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] + movq m4, [fg_dataq+FGData.ar_coeffs_uv+uvq] +%if WIN64 + DEFINE_ARGS shift, bufy, h, buf, max, cf3, min, val3, x, val0 +%if %2 + lea bufq, [r0-2*(82*(73-35*%3)+44-(82*3+41))] +%else + lea bufq, [r0-2*(82*69+3)] +%endif +%else +%if ARCH_X86_64 + DEFINE_ARGS buf, bufy, h, shift, max, cf3, min, val3, x, val0 +%else + DEFINE_ARGS buf, shift, pic_reg, fg_data, val0, bufy, cf3 
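+; note: in the AR(1) loops below the three top-row taps, the luma tap and the
+; rounding term are summed with pmaddwd, but the left tap (cf3) is applied
+; serially in .x_loop_ar1_inner because each output is the next pixel's left
+; input; per pixel, roughly:
+;   g[x] = clip(g[x] + ((vec_sum + cf3*g[x-1]) >> ar_coeff_shift), min, max)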
+%define hd dword r1m +%define mind dword r3m +%define maxd dword r4m +%endif +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif +%endif +%if ARCH_X86_64 + mov shiftd, [r2+FGData.ar_coeff_shift] +%else + mov shiftd, [r3+FGData.ar_coeff_shift] +%endif + pxor m5, m5 + pcmpgtb m5, m4 + punpcklbw m4, m5 ; cf0-4 in words + pshuflw m4, m4, q2100 + psrldq m4, 2 ; cf0-3,4 in words + pshufd m5, m4, q1111 + pshufd m4, m4, q0000 + movd m3, [base+round_vals+shiftq*2-12] ; rnd + pxor m6, m6 + punpcklwd m3, m6 +%if %2 + SPLATW m6, [base+hmul_bits+2+%3*2] +%endif + SPLATD m3, m3 + add bufyq, 2*(79+82*3) + mov hd, 70-35*%3 + sar maxd, 1 +%if ARCH_X86_64 + mov mind, maxd + xor mind, -1 +%else + DEFINE_ARGS buf, shift, val3, x, val0, bufy, cf3 + mov r2, maxd + xor r2, -1 + mov mind, r2 +%endif +.y_loop_ar1: + mov xq, -(76>>%2) + movsx val3d, word [bufq+xq*2-2] +.x_loop_ar1: + movu m0, [bufq+xq*2-82*2-2] ; top/left +%if %2 + movu m7, [bufyq+xq*4] +%if %3 + movu m1, [bufyq+xq*4+82*2] + phaddw m7, m1 +%else + phaddw m7, m7 +%endif +%else + movq m7, [bufyq+xq*2] +%endif + psrldq m2, m0, 2 ; top + psrldq m1, m0, 4 ; top/right + punpcklwd m0, m2 +%if %2 +%if %3 + pshufd m2, m7, q3232 + paddw m7, m2 +%endif + pmulhrsw m7, m6 +%endif + punpcklwd m1, m7 + pmaddwd m0, m4 + pmaddwd m1, m5 + paddd m0, m1 + paddd m0, m3 +.x_loop_ar1_inner: + movd val0d, m0 + psrldq m0, 4 + imul val3d, cf3d + add val3d, val0d + sar val3d, shiftb + movsx val0d, word [bufq+xq*2] + add val3d, val0d + cmp val3d, maxd + cmovg val3d, maxd + cmp val3d, mind + cmovl val3d, mind + mov word [bufq+xq*2], val3w + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar1 +%if ARCH_X86_32 +%undef maxd +%undef mind +%undef hd +%endif + RET + +.ar2: +%if ARCH_X86_64 + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift +%else + DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift + ALLOC_STACK -16*8 + mov bufyq, r1m + mov uvd, r3m +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 +%if ARCH_X86_64 + sar bdmaxd, 1 + SPLATW m5, bdmaxd ; max_grain +%else + SPLATW m5, r4m + psraw m5, 1 +%endif + pcmpeqw m6, m6 +%if !cpuflag(sse4) + pcmpeqw m7, m7 + psrldq m7, 14 + pslldq m7, 2 + pxor m7, m6 +%endif + pxor m6, m5 ; min_grain +%if %2 && cpuflag(sse4) + SPLATW m7, [base+hmul_bits+2+%3*2] +%endif + +%if ARCH_X86_64 + SWAP 5, 13 + SWAP 6, 14 + SWAP 7, 15 +%else +%define m13 [rsp+5*16] +%define m14 [rsp+6*16] +%define m15 [rsp+7*16] + mova m13, m5 + mova m14, m6 + mova m15, m7 +%endif + + ; coef values + movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] + pxor m1, m1 + pcmpgtb m1, m0 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pinsrw m2, [base+round_vals-12+shiftq*2], 5 + + pshufd m6, m0, q0000 + pshufd m7, m0, q1111 + pshufd m1, m0, q3333 + pshufd m0, m0, q2222 + pshufd m3, m2, q1111 + pshufd m4, m2, q2222 + pshufd m2, m2, q0000 + +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 + SWAP 2, 10 + SWAP 3, 11 + SWAP 4, 12 +%else +%define m8 [rsp+0*16] +%define m9 [rsp+1*16] +%define m10 [rsp+2*16] +%define m11 [rsp+3*16] +%define m12 [rsp+4*16] + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 + mova m12, m4 +%endif + +%if ARCH_X86_64 + DEFINE_ARGS buf, bufy, fg_data, h, x +%else + DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x +%endif +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif + add bufyq, 2*(79+82*3) + mov hd, 
70-35*%3 +.y_loop_ar2: + mov xq, -(76>>%2) + +.x_loop_ar2: + movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5] + movu m5, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] + psrldq m4, m0, 2 ; y=-2,x=[-1,+5] + psrldq m1, m0, 4 ; y=-2,x=[-0,+5] + psrldq m3, m0, 6 ; y=-2,x=[+1,+5] + psrldq m2, m0, 8 ; y=-2,x=[+2,+5] + punpcklwd m0, m4 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] + punpcklwd m1, m3 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] + punpcklwd m2, m5 ; y=-2/-1,x=[+2/-2,+3/-1,+4/+0,+5/+1] + pmaddwd m0, m6 + pmaddwd m1, m7 + pmaddwd m2, m8 + paddd m0, m1 + paddd m0, m2 + psrldq m3, m5, 2 ; y=-1,x=[-1,+5] + psrldq m1, m5, 4 ; y=-1,x=[-0,+5] + psrldq m4, m5, 6 ; y=-1,x=[+1,+5] + psrldq m2, m5, 8 ; y=-1,x=[+2,+5] + punpcklwd m3, m1 + punpcklwd m4, m2 + pmaddwd m3, m9 + pmaddwd m4, m10 + paddd m3, m4 + paddd m0, m3 + + ; luma component & rounding +%if %2 + movu m1, [bufyq+xq*4] +%if %3 + movu m2, [bufyq+xq*4+82*2] + phaddw m1, m2 + pshufd m2, m1, q3232 + paddw m1, m2 +%else + phaddw m1, m1 +%endif +%if cpuflag(sse4) + pmulhrsw m1, m15 +%elif %3 + pmulhrsw m1, [base+pw_8192] +%else + pmulhrsw m1, [base+pw_16384] +%endif +%else + movq m1, [bufyq+xq*2] +%endif + punpcklwd m1, [base+pw_1] + pmaddwd m1, m12 + paddd m0, m1 + + movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5] + pshufd m2, m1, q3321 + pxor m3, m3 + pcmpgtw m3, m2 + punpcklwd m2, m3 ; y=0,x=[0,3] in dword +.x_loop_ar2_inner: + pmaddwd m3, m1, m11 + paddd m3, m0 + psrldq m0, 4 ; shift top to next pixel + psrad m3, [fg_dataq+FGData.ar_coeff_shift] + ; we do not need to packssdw since we only care about one value + paddd m3, m2 + packssdw m3, m3 + pminsw m3, m13 + pmaxsw m3, m14 + psrldq m1, 2 + pslldq m3, 2 + psrldq m2, 4 +%if cpuflag(sse4) + pblendw m1, m3, 00000010b +%else + pand m1, m15 + pandn m4, m15, m3 + por m1, m4 +%endif + ; overwrite previous pixel, should be ok + movd [bufq+xq*2-2], m1 + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar2 +%if ARCH_X86_32 +%undef m13 +%undef m14 +%undef m15 +%endif + RET + +.ar3: +%if ARCH_X86_64 + DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift +%if WIN64 + mov r6, rsp + and rsp, ~15 + sub rsp, 96 + %define tmp rsp +%else + %define tmp rsp+stack_offset-120 +%endif +%else + DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift +%assign stack_offset stack_offset_old + ALLOC_STACK -16*14 + mov bufyq, r1m + mov uvd, r3m + %define tmp rsp +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + SPLATW m4, [base+round_vals-12+shiftq*2] + pxor m5, m5 + pcmpgtw m5, m4 + punpcklwd m4, m5 +%if ARCH_X86_64 + sar bdmaxd, 1 + SPLATW m6, bdmaxd ; max_grain +%else + SPLATW m6, r4m + psraw m6, 1 +%endif + pcmpeqw m7, m7 +%if !cpuflag(sse4) + pcmpeqw m3, m3 + psrldq m3, 14 + pslldq m3, 4 + pxor m3, m7 +%endif + pxor m7, m6 ; min_grain +%if %2 && cpuflag(sse4) + SPLATW m3, [base+hmul_bits+2+%3*2] +%endif + +%if ARCH_X86_64 + SWAP 3, 11 + SWAP 4, 12 + SWAP 6, 14 + SWAP 7, 15 +%else +%define m11 [rsp+ 9*16] +%define m12 [rsp+10*16] +%define m14 [rsp+12*16] +%define m15 [rsp+13*16] + mova m11, m3 + mova m12, m4 + mova m14, m6 + mova m15, m7 +%endif + + ; cf from y=-3,x=-3 until y=-3,x=-2 + movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] + pxor m1, m1 + pcmpgtb m1, m0 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pshufd m1, m0, q0000 + pshufd m3, m0, q1111 + pshufd m4, m0, q2222 + pshufd m0, m0, q3333 + pshufd m5, m2, q0000 + pshufd m6, m2, q1111 + mova [tmp+16*0], m1 + mova [tmp+16*1], m3 + mova [tmp+16*2], m4 + mova [tmp+16*3], m0 
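+ ; the 24 lag-3 taps plus the luma tap and rounding exceed the xmm register
+ ; budget (especially on x86-32), so coefficient splats are staged in stack
+ ; slots and reloaded from [tmp+16*n] inside the row loop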
+ mova [tmp+16*4], m5 + mova [tmp+16*5], m6 + pshufd m6, m2, q2222 + pshufd m7, m2, q3333 + + ; cf from y=-1,x=-1 to y=0,x=-1 + luma component + movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] + pxor m1, m1 + pcmpgtb m1, m0 + punpckhbw m2, m0, m1 ; luma + punpcklbw m0, m1 + pshufd m3, m0, q3232 + psrldq m5, m0, 10 + ; y=0,x=[-3 to -1] + "1.0" for current pixel + pinsrw m5, [base+round_vals-10+shiftq*2], 3 + ; y=-1,x=[-1 to +2] + pshufd m1, m0, q0000 + pshufd m0, m0, q1111 + ; y=-1,x=+3 + luma + punpcklwd m3, m2 + pshufd m3, m3, q0000 + +%if ARCH_X86_64 + SWAP 1, 8 + SWAP 0, 9 + SWAP 3, 10 + SWAP 5, 13 + DEFINE_ARGS buf, bufy, fg_data, h, x +%else +%define m8 [rsp+ 6*16] +%define m9 [rsp+ 7*16] +%define m10 [rsp+ 8*16] +%define m13 [rsp+11*16] + mova m8, m1 + mova m9, m0 + mova m10, m3 + mova m13, m5 + DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x +%endif +%if %2 + sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) +%else + sub bufq, 2*(82*69+3) +%endif + add bufyq, 2*(79+82*3) + mov hd, 70-35*%3 +.y_loop_ar3: + mov xq, -(76>>%2) + +.x_loop_ar3: + ; first line + movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] + movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6] + palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5] + palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6] + punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5] + punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1] + shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3] + + pmaddwd m0, [tmp+0*16] + pmaddwd m2, [tmp+1*16] + pmaddwd m3, [tmp+2*16] + paddd m0, m2 + paddd m0, m3 ; first 6 x of top y + + ; second line [m0/1 are busy] + movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4] + movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6] + punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0] + palignr m4, m3, m2, 2 ; y=-2,x=[-2,+5] + palignr m3, m3, m2, 4 ; y=-2,x=[-1,+6] + punpckhwd m5, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6] + punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] + shufps m3, m4, m5, q1032 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] + pmaddwd m1, [tmp+3*16] + pmaddwd m4, [tmp+4*16] + pmaddwd m3, [tmp+5*16] + pmaddwd m5, m6 + paddd m1, m4 + paddd m3, m5 + paddd m0, m1 + paddd m0, m3 ; top 2 lines + + ; third line [m0 is busy] & luma + round + movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] + movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6] +%if %2 + movu m5, [bufyq+xq*4] +%if %3 + movu m4, [bufyq+xq*4+82*2] + phaddw m5, m4 +%else + phaddw m5, m5 +%endif +%else + movq m5, [bufyq+xq*2] +%endif + palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5] + palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6] +%if %3 + pshufd m4, m5, q3232 + paddw m5, m4 +%endif +%if %2 +%if cpuflag(sse4) + pmulhrsw m5, m11 +%elif %3 + pmulhrsw m5, [base+pw_8192] +%else + pmulhrsw m5, [base+pw_16384] +%endif +%endif + punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5] + punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] + shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3] + punpcklwd m2, m5 + pmaddwd m1, m7 + pmaddwd m3, m8 + pmaddwd m4, m9 + pmaddwd m2, m10 + paddd m1, m3 + paddd m4, m2 + paddd m0, m12 ; += round + paddd m1, m4 + paddd m0, m1 + + movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4] +.x_loop_ar3_inner: + pmaddwd m2, m1, m13 + pshufd m3, m2, q1111 + paddd m2, m3 ; left+cur + paddd m2, m0 ; add top + psrldq m0, 4 + psrad m2, [fg_dataq+FGData.ar_coeff_shift] + packssdw m2, m2 + pminsw m2, m14 + pmaxsw m2, m15 + pslldq m2, 4 + psrldq m1, 2 +%if cpuflag(sse4) + pblendw m1, m2, 00000100b +%else + pand m1, m11 + pandn m3, m11, m2 + por m1, m3 +%endif + ; overwrite previous pixels, should be ok + movq
[bufq+xq*2-4], m1 + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82*2 + add bufyq, 82*2<<%3 + dec hd + jg .y_loop_ar3 +%if WIN64 + mov rsp, r6 +%elif ARCH_X86_32 +%undef m8 +%undef m9 +%undef m10 +%undef m11 +%undef m12 +%undef m13 +%undef m14 +%undef m15 +%endif + RET +%endmacro + +generate_grain_uv_fn 420, 1, 1 +generate_grain_uv_fn 422, 1, 0 +generate_grain_uv_fn 444, 0, 0 + +%macro SCRATCH 3 +%if ARCH_X86_32 + mova [rsp+%3*mmsize], m%1 +%define m%2 [rsp+%3*mmsize] +%else + SWAP %1, %2 +%endif +%endmacro + +INIT_XMM ssse3 +%if ARCH_X86_32 +%if STACK_ALIGNMENT < mmsize +cglobal fgy_32x32xn_16bpc, 0, 7, 8, 0-(8 * mmsize + 12 * gprsize), \ + dst, src, scaling, unused1, fg_data, picptr, unused2 + ; copy stack arguments to new position post-alignment, so that we + ; don't have to keep the old stack location in a separate register + mov r0, r0m + mov r1, r2m + mov r2, r4m + mov r3, r6m + mov r4, r7m + mov r5, r8m + +%define r0m [rsp+8*mmsize+ 3*gprsize] +%define r2m [rsp+8*mmsize+ 5*gprsize] +%define r4m [rsp+8*mmsize+ 7*gprsize] +%define r6m [rsp+8*mmsize+ 9*gprsize] +%define r7m [rsp+8*mmsize+10*gprsize] +%define r8m [rsp+8*mmsize+11*gprsize] + + mov r0m, r0 + mov r2m, r1 + mov r4m, r2 + mov r6m, r3 + mov r7m, r4 + mov r8m, r5 +%else +cglobal fgy_32x32xn_16bpc, 0, 7, 8, 8 * mmsize + 4 * gprsize, \ + dst, src, scaling, unused1, fg_data, picptr, unused2 +%endif + mov srcq, srcm + mov scalingq, r5m + mov fg_dataq, r3m +%if STACK_ALIGNMENT < mmsize + mov r6, r9m + +%define r9m [rsp+8*mmsize+ 4*gprsize] +%define r3m [rsp+8*mmsize+ 6*gprsize] +%define r5m [rsp+8*mmsize+ 8*gprsize] + + mov r9m, r6 +%endif + LEA r5, $$ +%define base r5-$$ + mov r5m, picptrq +%else +cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut + lea r8, [pb_mask] +%define base r8-pb_mask +%endif + mov r6d, [fg_dataq+FGData.scaling_shift] + SPLATW m3, [base+mul_bits+r6*2-14] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] +%if ARCH_X86_32 + DECLARE_REG_TMP 0, 3 +%else + DECLARE_REG_TMP 9, 10 +%endif + mov t0d, r9m ; bdmax + sar t0d, 11 ; is_12bpc + inc t0d + mov t1d, r6d + imul t1d, t0d + dec t0d + SPLATW m5, [base+min+t1*2] + lea t0d, [t0d*3] + lea t0d, [r6d*2+t0d] + SPLATW m4, [base+max+t0*2] + SPLATW m2, r9m + + pcmpeqw m1, m1 + psraw m7, m2, 1 ; max_grain + pxor m1, m7 ; min_grain + SPLATD m6, [base+pd_16] + + SCRATCH 1, 9, 0 + SCRATCH 2, 10, 1 + SCRATCH 3, 11, 2 + SCRATCH 4, 12, 3 + SCRATCH 5, 13, 4 + SCRATCH 6, 14, 5 + SCRATCH 7, 15, 6 + + mova m6, [base+pw_27_17_17_27] ; for horizontal filter + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused2 + DECLARE_REG_TMP 0 +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ + sby, see + DECLARE_REG_TMP 7 +%endif + + mov sbyd, r8m + movzx t0d, byte [fg_dataq+FGData.overlap_flag] + test t0d, t0d + jz .no_vertical_overlap + test sbyd, sbyd + jnz .vertical_overlap +.no_vertical_overlap: + mov dword r8m, t0d + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused + imul seed, (173 << 24) | 37 +%else + imul seed, sbyd, (173 << 24) | 37 +%endif + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + + mov r3m, seed + mov wq, r4m +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused1, unused2, see, src_bak +%endif + + lea src_bakq, [srcq+wq*2] + mov 
r9mp, src_bakq + neg wq + sub dstmp, srcq +%if ARCH_X86_32 + mov r4m, wq +%endif + +.loop_x: +%if ARCH_X86_32 + mov seed, r3m +%endif + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak + + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak +%endif + +.loop_x_odd: + movzx hd, word r7m + mov grain_lutq, grain_lutmp +.loop_y: + ; src + pand m0, m10, [srcq+ 0] + pand m1, m10, [srcq+16] ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m4 + vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m4 +%else + vpgatherdw m2, m0, scalingq-1, r11, r13, 8, 1, m4 + vpgatherdw m3, m1, scalingq-1, r11, r13, 8, 1, m4 +%endif + REPX {psrlw x, 8}, m2, m3 + + ; grain = grain_lut[offy+y][offx+x] + movu m4, [grain_lutq+offxyq*2] + movu m5, [grain_lutq+offxyq*2+16] + + ; noise = round2(scaling[src] * grain, scaling_shift) + REPX {pmullw x, m11}, m2, m3 + pmulhrsw m4, m2 + pmulhrsw m5, m3 + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+16], m1 + + add srcq, r2mp ; src += stride + add grain_lutq, 82*2 + dec hd + jg .loop_y + +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end +%if ARCH_X86_32 + mov srcq, r9mp + add srcq, r4mp + add srcq, r4mp +%else + mov src_bakq, r9mp + lea srcq, [src_bakq+wq*2] +%endif + btc dword r8m, 2 + jc .next_blk + add offxyd, 16 + test dword r8m, 2 + jz .loop_x_odd +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r12d, 16 ; top_offxy += 16 +%endif + jmp .loop_x_odd_v_overlap + +.next_blk: + test dword r8m, 1 + jz .loop_x + + ; r8m = sbym + test dword r8m, 2 + jnz .loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +.loop_x_h_overlap: +%if ARCH_X86_32 + add offxyd, 16 + mov [rsp+8*mmsize+0*gprsize], offxyd + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + mov seed, r3m +%endif + + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, left_offxy + + lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx + + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, left_offxy +%endif + + mov hd, dword r7m + mov grain_lutq, grain_lutmp +.loop_y_h_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m5, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r5, [rsp+8*mmsize+0*gprsize] + movd m4, [grain_lutq+r5*2] 
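+ ; x86-32 has no spare gpr for left_offxy, so it is reloaded from its
+ ; stack slot before fetching the left column's grain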
+%else + movd m4, [grain_lutq+left_offxyq*2] +%endif + punpcklwd m4, m5 + pmaddwd m4, m6 + paddd m4, m14 + psrad m4, 5 + packssdw m4, m4 + pminsw m4, m15 + pmaxsw m4, m9 + shufps m4, m5, q3210 + + ; src + pand m0, m10, [srcq+ 0] + pand m1, m10, [srcq+16] ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m5 + vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m5 +%else + vpgatherdw m2, m0, scalingq-1, r13, r14, 8, 1, m5 + vpgatherdw m3, m1, scalingq-1, r13, r14, 8, 1, m5 +%endif + REPX {psrlw x, 8}, m2, m3 + + ; noise = round2(scaling[src] * grain, scaling_shift) + movu m5, [grain_lutq+offxyq*2+16] + REPX {pmullw x, m11}, m2, m3 + pmulhrsw m4, m2 + pmulhrsw m5, m3 + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+16], m1 + + add srcq, r2mp + add grain_lutq, 82*2 + dec hd + jg .loop_y_h_overlap + +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end +%if ARCH_X86_32 + mov srcq, r9mp + add srcq, r4mp + add srcq, r4mp +%else + mov src_bakq, r9mp + lea srcq, [src_bakq+wq*2] +%endif + or dword r8m, 4 + add offxyd, 16 + + ; r8m = sbym + test dword r8m, 2 + jz .loop_x_odd +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r12d, 16 ; top_offxy += 16 +%endif + jmp .loop_x_odd_v_overlap + +.end: + RET + +.vertical_overlap: + or t0d, 2 + mov r8m, t0d + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ + sby, see +%endif + + movzx sbyd, sbyb +%if ARCH_X86_32 + imul r4, [fg_dataq+FGData.seed], 0x00010001 + DEFINE_ARGS dst, src, scaling, sby, see, picptr, unused +%else + imul seed, [fg_dataq+FGData.seed], 0x00010001 +%endif + imul t0d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add t0d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and t0d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, t0d +%if ARCH_X86_32 + xor sbyd, seed + + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + + mov r3m, seed + mov wq, r4m +%else + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused1, unused2, see, src_bak +%endif + + lea src_bakq, [srcq+wq*2] + mov r9mp, src_bakq + neg wq + sub dstmp, srcq +%if ARCH_X86_32 + mov r4m, wq +%endif + +.loop_x_v_overlap: +%if ARCH_X86_32 + mov r5, r5m + SPLATD m7, [base+pw_27_17_17_27] + mov seed, r3m +%else + SPLATD m7, [pw_27_17_17_27] +%endif + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp t0b ; parity of top_seed + shr seed, 16 + shl t0d, 16 + test seeb, seeh + setp t0b ; parity of cur_seed + or r6d, 0x00010001 + xor t0d, r6d + mov seed, t0d + ror seed, 1 ; updated (cur_seed << 16) | top_seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, unused, top_offxy + + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + +%if ARCH_X86_32 + DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut +%else + 
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, unused, top_offxy +%endif + + movzx top_offxyd, offxyw +%if ARCH_X86_32 + mov [rsp+8*mmsize+1*gprsize], top_offxyd + + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + shr offxyd, 16 + +.loop_x_odd_v_overlap: +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m7, [PIC_ptr(pw_27_17_17_27)] + mov hd, dword r7m + mov grain_lutq, grain_lutmp +.loop_y_v_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r5, [rsp+8*mmsize+1*gprsize] + movu m2, [grain_lutq+r5*2] +%else + movu m2, [grain_lutq+top_offxyq*2] +%endif + punpckhwd m4, m2, m3 + punpcklwd m2, m3 + REPX {pmaddwd x, m7}, m4, m2 + REPX {paddd x, m14}, m4, m2 + REPX {psrad x, 5}, m4, m2 + packssdw m2, m4 + pminsw m2, m15 + pmaxsw m2, m9 + movu m4, [grain_lutq+offxyq*2+16] +%if ARCH_X86_32 + movu m3, [grain_lutq+r5*2+16] +%else + movu m3, [grain_lutq+top_offxyq*2+16] +%endif + punpckhwd m5, m3, m4 + punpcklwd m3, m4 + REPX {pmaddwd x, m7}, m5, m3 + REPX {paddd x, m14}, m5, m3 + REPX {psrad x, 5}, m5, m3 + packssdw m3, m5 + pminsw m3, m15 + pmaxsw m3, m9 + + ; src + pand m0, m10, [srcq+ 0] ; m0-1: src as word + pand m1, m10, [srcq+16] ; m0-1: src as word + + ; scaling[src] + ; noise = round2(scaling[src] * grain, scaling_shift) +%if ARCH_X86_32 + vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5 +%else + vpgatherdw m4, m0, scalingq-1, r11, r13, 8, 1, m5 +%endif + psrlw m4, 8 + pmullw m4, m11 + pmulhrsw m4, m2 +%if ARCH_X86_32 + vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m2 +%else + vpgatherdw m5, m1, scalingq-1, r11, r13, 8, 1, m2 +%endif + psrlw m5, 8 + pmullw m5, m11 + pmulhrsw m5, m3 + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+16], m1 + + add srcq, r2mp + add grain_lutq, 82*2 + dec hw + jz .end_y_v_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4] + xor hd, 0x10000 + test hd, 0x10000 + jnz .loop_y_v_overlap + jmp .loop_y + +.end_y_v_overlap: +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end_hv +%if ARCH_X86_32 + mov srcq, r9mp + add srcq, r4mp + add srcq, r4mp +%else + mov src_bakq, r9mp + lea srcq, [src_bakq+wq*2] +%endif + btc dword r8m, 2 + jc .next_blk_v +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add top_offxyd, 16 +%endif + add offxyd, 16 + jmp .loop_x_odd_v_overlap + +.next_blk_v: + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap + +.loop_x_hv_overlap: +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + + mov r0, [rsp+8*mmsize+1*gprsize] + add r3, 16 + add r0, 16 + mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy + mov [rsp+8*mmsize+2*gprsize], r0 ; topleft_offxy + + mov seed, r3m + xor r0, r0 +%else + ; we assume from the block above that bits 8-15 of r7d are zero'ed +%endif + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp t0b ; parity of top_seed + shr seed, 16 + shl t0d, 16 + test seeb, seeh + setp t0b ; parity of cur_seed + or r6d, 0x00010001 + xor t0d, r6d + mov seed, t0d + ror seed, 1 ; updated (cur_seed << 16) | top_seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, 
picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy + + lea topleft_offxyq, [top_offxyq+16] + lea left_offxyq, [offyq+16] + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + +%if ARCH_X86_32 + DEFINE_ARGS top_offxy, src, scaling, offxy, w, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy +%endif + + movzx top_offxyd, offxyw +%if ARCH_X86_32 + mov [rsp+8*mmsize+1*gprsize], top_offxyd + + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + shr offxyd, 16 + +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m7, [PIC_ptr(pw_27_17_17_27)] + + movzx hd, word r7m + mov grain_lutq, grain_lutmp +.loop_y_hv_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m2, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy + mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy + movu m4, [grain_lutq+r0*2] + movd m5, [grain_lutq+r5*2] + mov r5, [rsp+8*mmsize+2*gprsize] ; topleft_offxy + movd m3, [grain_lutq+r5*2] +%else + movu m4, [grain_lutq+top_offxyq*2] + movd m5, [grain_lutq+left_offxyq*2] + movd m3, [grain_lutq+topleft_offxyq*2] +%endif + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklwd m5, m2 + punpcklwd m3, m4 + REPX {pmaddwd x, m6}, m5, m3 + REPX {paddd x, m14}, m5, m3 + REPX {psrad x, 5}, m5, m3 + packssdw m5, m3 + pminsw m5, m15 + pmaxsw m5, m9 + shufps m3, m5, m2, q3210 + shufps m5, m4, q3232 + ; followed by v interpolation (top | cur -> cur) + movu m0, [grain_lutq+offxyq*2+16] +%if ARCH_X86_32 + movu m1, [grain_lutq+r0*2+16] +%else + movu m1, [grain_lutq+top_offxyq*2+16] +%endif + punpcklwd m2, m5, m3 + punpckhwd m5, m3 + punpcklwd m3, m1, m0 + punpckhwd m1, m0 + REPX {pmaddwd x, m7}, m2, m5, m3, m1 + REPX {paddd x, m14}, m2, m5, m3, m1 + REPX {psrad x, 5}, m2, m5, m3, m1 + packssdw m2, m5 + packssdw m3, m1 + REPX {pminsw x, m15}, m2, m3 + REPX {pmaxsw x, m9}, m2, m3 + + ; src + pand m0, m10, [srcq+ 0] + pand m1, m10, [srcq+16] ; m0-1: src as word + + ; scaling[src] + ; noise = round2(scaling[src] * grain, scaling_shift) +%if ARCH_X86_32 + vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5 +%else + vpgatherdw m4, m0, scalingq-1, r14, r10, 8, 1, m5 +%endif + psrlw m4, 8 + pmullw m4, m11 + pmulhrsw m2, m4 +%if ARCH_X86_32 + vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m4 +%else + vpgatherdw m5, m1, scalingq-1, r14, r10, 8, 1, m4 +%endif + psrlw m5, 8 + pmullw m5, m11 + pmulhrsw m3, m5 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+srcq+ 0], m0 + mova [dstq+srcq+16], m1 + + add srcq, r2mp + add grain_lutq, 82*2 + dec hw + jz .end_y_hv_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4] + xor hd, 0x10000 + test hd, 0x10000 + jnz .loop_y_hv_overlap + jmp .loop_y_h_overlap + +.end_y_hv_overlap: + or dword r8m, 4 +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end_hv +%if ARCH_X86_32 + mov r5, r5m + add offxyd, 16 + add dword 
[rsp+8*mmsize+1*gprsize], 16 ; top_offxy += 16 + mov srcq, r9mp + add srcq, r4mp + add srcq, r4mp +%else + add offxyd, 16 + add top_offxyd, 16 + mov src_bakq, r9mp + lea srcq, [src_bakq+wq*2] +%endif + jmp .loop_x_odd_v_overlap + +.end_hv: + RET +%if ARCH_X86_32 + DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 +%endif + +%macro FGUV_FN 3 ; name, ss_hor, ss_ver +INIT_XMM ssse3 +%if ARCH_X86_32 +%if STACK_ALIGNMENT < mmsize +cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 0-(8 * mmsize + 16 * gprsize), \ + tmp, src, scaling, h, fg_data, picptr, unused + mov r0, r0m + mov r1, r1m + mov r2, r2m + mov r4, r3m + mov r3, r4m + mov r5, r5m +%define r0m [rsp+8*mmsize+ 3*gprsize] +%define r1m [rsp+8*mmsize+ 4*gprsize] +%define r2m [rsp+8*mmsize+ 5*gprsize] +%define r3m [rsp+8*mmsize+ 6*gprsize] +%define r4m [rsp+8*mmsize+ 7*gprsize] +%define r5m [rsp+8*mmsize+ 8*gprsize] + mov r0m, r0 + mov r2m, r2 + mov r4m, r3 + mov r5m, r5 + + mov r0, r6m + mov r2, r7m + mov r3, r8m + mov r5, r9m +%define r6m [rsp+8*mmsize+ 9*gprsize] +%define r7m [rsp+8*mmsize+10*gprsize] +%define r8m [rsp+8*mmsize+11*gprsize] +%define r9m [rsp+8*mmsize+12*gprsize] + mov r6m, r0 + mov r7m, r2 + mov r8m, r3 + mov r9m, r5 + + mov r2, r10m + mov r3, r11m + mov r5, r12m + mov r0, r13m +%define r10m [rsp+8*mmsize+13*gprsize] +%define r11m [rsp+8*mmsize+14*gprsize] +%define r12m [rsp+8*mmsize+15*gprsize] + mov r10m, r2 + mov r11m, r3 + mov r12m, r5 + + SPLATW m2, r13m +%else +cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \ + tmp, src, scaling, h, fg_data, picptr, unused + mov srcq, srcm + mov fg_dataq, r3m +%endif + LEA r5, $$ +%define base r5-$$ + + DECLARE_REG_TMP 0, 2, 3 +%else +cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ + grain_lut, h, sby, luma, lstride, uv_pl, is_id +%define base r8-pb_mask + lea r8, [pb_mask] + + DECLARE_REG_TMP 9, 10, 11 +%endif + mov r6d, [fg_dataq+FGData.scaling_shift] + SPLATW m3, [base+mul_bits+r6*2-14] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] +%if STACK_ALIGNMENT >= mmsize + mov t0d, r13m ; bdmax +%endif + sar t0d, 11 ; is_12bpc + inc t0d + mov t1d, r6d + imul t1d, t0d + dec t0d + SPLATW m5, [base+min+t1*2] + lea t1d, [t0d*3] + mov t2d, r12m + inc t2d + imul r6d, t2d + add t1d, r6d + SPLATW m4, [base+max+t1*2] +%if STACK_ALIGNMENT >= mmsize + SPLATW m2, r13m +%endif + + SCRATCH 2, 10, 2 + SCRATCH 3, 11, 3 + SCRATCH 4, 12, 4 + SCRATCH 5, 13, 5 + +%define mzero m7 + +%if %3 + SPLATD m2, [base+pw_23_22] +%endif + +%if ARCH_X86_32 + mov scalingq, r5m + mov r5m, r5 +%else + mov r13mp, strideq +%endif + + pcmpeqw m0, m0 + psraw m1, m10, 1 + pxor m0, m1 + + SCRATCH 0, 8, 0 + SCRATCH 1, 9, 1 + + cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 + jne .csfl + +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_h, ss_v +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap + + DECLARE_REG_TMP 0 +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap + + DECLARE_REG_TMP 9 +%endif + +%if %1 + mov r6d, r11m + SPLATW m0, [fg_dataq+FGData.uv_mult+r6*4] + SPLATW m1, [fg_dataq+FGData.uv_luma_mult+r6*4] + punpcklwd m6, m1, m0 + SPLATW m5, [fg_dataq+FGData.uv_offset+r6*4] + SPLATD m7, [base+pw_4+t0*4] + pmullw m5, m7 +%else + SPLATD m6, [base+pd_16] +%if %2 + mova m5, [base+pw_23_22] +%else + mova m5, [base+pw_27_17_17_27] +%endif +%endif + + SCRATCH 6, 14, 6 + SCRATCH 5, 15, 7 + +%if ARCH_X86_32 + DECLARE_REG_TMP 0 +%else + DECLARE_REG_TMP 7 +%endif + + mov sbyd, r8m + mov t0d, 
[fg_dataq+FGData.overlap_flag] + test t0d, t0d + jz %%no_vertical_overlap + test sbyd, sbyd + jnz %%vertical_overlap + +%%no_vertical_overlap: + mov r8m, t0d +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap + imul seed, (173 << 24) | 37 +%else + imul seed, sbyd, (173 << 24) | 37 +%endif + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, see, w, picptr, luma + + mov dstq, r0mp + mov lumaq, r9mp + mov wq, r4m + lea r3, [srcq+wq*2] + mov r1mp, r3 + lea r3, [dstq+wq*2] + mov r11mp, r3 + lea r3, [lumaq+wq*(2<<%2)] + mov r12mp, r3 +%if %3 + shl r10mp, 1 +%endif +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused2, unused3, see, unused4, unused5, unused6, luma, lstride + + mov lstrideq, r10mp +%if %3 + add lstrideq, lstrideq +%endif + mov lumaq, r9mp + lea r10, [srcq+wq*2] + lea r11, [dstq+wq*2] + lea r12, [lumaq+wq*(2<<%2)] + mov r10mp, r10 + mov r11mp, r11 + mov r12mp, r12 +%endif + neg wq +%if ARCH_X86_32 + mov r4mp, wq +%endif + +%%loop_x: +%if ARCH_X86_32 + mov seed, r3m +%endif + + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, unused1, unused2, unused3, luma, lstride + + mov offxd, seed + mov offyd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, unused1, unused2, unused3, luma, lstride +%endif + +%if %2 == 0 +%%loop_x_odd: +%endif + mov hd, r7m + mov grain_lutq, grain_lutmp +%%loop_y: + ; src + mova m0, [srcq] + mova m1, [srcq+16] ; m0-1: src as word + + ; luma_src + pxor mzero, mzero +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut + + mov lumaq, r9m +%endif + mova m4, [lumaq+ 0] + mova m6, [lumaq+(16<<%2)] +%if %2 + phaddw m4, [lumaq+16] + phaddw m6, [lumaq+48] +%endif +%if ARCH_X86_32 + add lumaq, r10mp + mov r9m, lumaq +%endif +%if %2 + pavgw m4, mzero + pavgw m6, mzero +%endif + +%if %1 + punpckhwd m3, m4, m0 + punpcklwd m4, m0 + punpckhwd m5, m6, m1 + punpcklwd m6, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m3, m4, m5, m6 + REPX {psrad x, 6}, m3, m4, m5, m6 + packssdw m4, m3 + packssdw m6, m5 + REPX {paddw x, m15}, m4, m6 + REPX {pmaxsw x, mzero}, m4, m6 + REPX {pminsw x, m10}, m4, m6 ; clip_pixel() +%else + REPX {pand x, m10}, m4, m6 +%endif + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m3, m4, scalingq-1, r0, r5, 8, 1 + vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1 +%else + vpgatherdw m3, m4, scalingq-1, r10, r12, 8, 1 + vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1 +%endif + REPX {psrlw x, 8}, m3, m5 + + ; grain = grain_lut[offy+y][offx+x] + movu m4, [grain_lutq+offxyq*2] + movu m6, [grain_lutq+offxyq*2+16] + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + REPX {pmullw x, m11}, m3, m5 + pmulhrsw m4, m3 + pmulhrsw m6, m5 + + ; dst = clip_pixel(src, noise) + paddw m0, m4 + paddw m1, m6 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+ 0], m0 + mova 
[dstq+16], m1 + +%if ARCH_X86_32 + add srcq, r2mp + add dstq, r2mp + mov dstmp, dstq +%else + add srcq, r13mp + add dstq, r13mp + add lumaq, lstrideq +%endif + add grain_lutq, 82*2 + dec hd + jg %%loop_y + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, w, picptr, luma + + mov wq, r4mp +%endif + add wq, 16 + jge %%end +%if ARCH_X86_32 + mov srcq, r1mp +%else + mov srcq, r10mp +%endif + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] +%if ARCH_X86_32 + mov r0m, dstq + mov r9m, lumaq + mov r4m, wq +%endif +%if %2 == 0 + btc dword r8m, 2 + jc %%next_blk + add offxyd, 16 + test dword r8m, 2 + jz %%loop_x_odd +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r11d, 16 +%endif + jmp %%loop_x_odd_v_overlap +%%next_blk: +%endif + test dword r8m, 1 + je %%loop_x + + ; r8m = sbym + test dword r8m, 2 + jnz %%loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +%%loop_x_h_overlap: +%if ARCH_X86_32 + add offxyd, 16 + mov [rsp+8*mmsize+0*gprsize], offxyd + + DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut + + mov seed, r3m +%endif + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, unused1, unused2, luma, lstride + + lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx + mov offxd, seed + mov offyd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, unused1, unused2, luma, lstride +%endif + + mov hd, r7m + mov grain_lutq, grain_lutmp +%%loop_y_h_overlap: + mova m0, [srcq] + mova m1, [srcq+16] + + ; luma_src + pxor mzero, mzero +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut + mov lumaq, r9m +%endif + mova m4, [lumaq+ 0] + mova m6, [lumaq+(16<<%2)] +%if %2 + phaddw m4, [lumaq+16] + phaddw m6, [lumaq+48] +%endif +%if ARCH_X86_32 + add lumaq, r10mp + mov r9m, lumaq +%endif +%if %2 + pavgw m4, mzero + pavgw m6, mzero +%endif + +%if %1 + punpckhwd m3, m4, m0 + punpcklwd m4, m0 + punpckhwd m5, m6, m1 + punpcklwd m6, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m3, m4, m5, m6 + REPX {psrad x, 6}, m3, m4, m5, m6 + packssdw m4, m3 + packssdw m6, m5 + REPX {paddw x, m15}, m4, m6 + REPX {pmaxsw x, mzero}, m4, m6 + REPX {pminsw x, m10}, m4, m6 ; clip_pixel() +%else + REPX {pand x, m10}, m4, m6 +%endif + + ; grain = grain_lut[offy+y][offx+x] + movu m7, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r5, [rsp+8*mmsize+0*gprsize] + movd m5, [grain_lutq+r5*2] +%else + movd m5, [grain_lutq+left_offxyq*2+ 0] +%endif + punpcklwd m5, m7 ; {left0, cur0} +%if %1 +%if ARCH_X86_32 + mov r5, r5m +%endif +%if %2 + pmaddwd m5, [PIC_ptr(pw_23_22)] +%else + pmaddwd m5, [PIC_ptr(pw_27_17_17_27)] +%endif + paddd m5, [PIC_ptr(pd_16)] +%else + pmaddwd m5, m15 + paddd m5, m14 +%endif + psrad m5, 5 + packssdw m5, m5 + pmaxsw m5, m8 + pminsw m5, m9 + shufps m5, m7, q3210 + movu m3, [grain_lutq+offxyq*2+16] + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m4, scalingq-1, r0, r5, 8, 1 + vpgatherdw m4, m6, 
scalingq-1, r0, r5, 8, 1 +%else + vpgatherdw m7, m4, scalingq-1, r2, r12, 8, 1 + vpgatherdw m4, m6, scalingq-1, r2, r12, 8, 1 +%endif + REPX {psrlw x, 8}, m7, m4 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + REPX {pmullw x, m11}, m7, m4 + pmulhrsw m5, m7 + pmulhrsw m3, m4 + + ; dst = clip_pixel(src, noise) + paddw m0, m5 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+ 0], m0 + mova [dstq+16], m1 + +%if ARCH_X86_32 + add srcq, r2mp + add dstq, r2mp + mov dstmp, dstq +%else + add srcq, r13mp + add dstq, r13mp + add lumaq, lstrideq +%endif + add grain_lutq, 82*2 + dec hd + jg %%loop_y_h_overlap + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut + mov wq, r4mp +%endif + add wq, 16 + jge %%end +%if ARCH_X86_32 + mov srcq, r1mp +%else + mov srcq, r10mp +%endif + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] +%if ARCH_X86_32 + mov r0mp, dstq + mov r9mp, lumaq + mov r4m, wq +%endif + +%if %2 + ; r8m = sbym + test dword r8m, 2 + jne %%loop_x_hv_overlap + jmp %%loop_x_h_overlap +%else + or dword r8m, 4 + add offxyd, 16 + + ; r8m = sbym + test dword r8m, 2 + jz %%loop_x_odd +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r11d, 16 ; top_offxy += 16 +%endif + jmp %%loop_x_odd_v_overlap +%endif + +%%end: + RET + +%%vertical_overlap: + or t0d, 2 + mov r8m, t0d + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ + sby, see, unused1, unused2, unused3, lstride +%endif + + movzx sbyd, sbyb +%if ARCH_X86_32 + imul r4, [fg_dataq+FGData.seed], 0x00010001 + + DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused +%else + imul seed, [fg_dataq+FGData.seed], 0x00010001 +%endif + imul t0d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add t0d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and t0d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, t0d +%if ARCH_X86_32 + xor sbyd, seed + + DEFINE_ARGS dst, src, scaling, see, w, picptr, luma + + mov r3m, seed + mov dstq, r0mp + mov lumaq, r9mp + mov wq, r4m + lea r3, [srcq+wq*2] + mov r1mp, r3 + lea r3, [dstq+wq*2] + mov r11mp, r3 + lea r3, [lumaq+wq*(2<<%2)] + mov r12mp, r3 +%if %3 + shl r10mp, 1 +%endif +%else + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused1, unused2, see, unused3, unused4, unused5, luma, lstride + + mov lstrideq, r10mp +%if %3 + add lstrideq, lstrideq +%endif + mov lumaq, r9mp + lea r10, [srcq+wq*2] + lea r11, [dstq+wq*2] + lea r12, [lumaq+wq*(2<<%2)] + mov r10mp, r10 + mov r11mp, r11 + mov r12mp, r12 +%endif + neg wq +%if ARCH_X86_32 + mov r4m, wq +%endif + +%%loop_x_v_overlap: +%if ARCH_X86_32 + mov seed, r3m + xor t0d, t0d +%else + ; we assume from the block above that bits 8-15 of r7d are zero'ed +%endif + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp t0b ; parity of top_seed + shr seed, 16 + shl t0d, 16 + test seeb, seeh + setp t0b ; parity of cur_seed + or r6d, 0x00010001 + xor t0d, r6d + mov seed, t0d + ror seed, 1 ; updated (cur_seed << 16) | top_seed +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, unused1, top_offxy, unused2, luma, lstride + + mov offyd, seed + mov 
offxd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + +%if ARCH_X86_32 + DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, unused1, top_offxy, unused2, luma, lstride +%endif + movzx top_offxyd, offxyw +%if ARCH_X86_32 + mov [rsp+8*mmsize+1*gprsize], top_offxyd + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + shr offxyd, 16 + +%if %2 == 0 +%%loop_x_odd_v_overlap: +%endif +%if %3 == 0 +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m2, [PIC_ptr(pw_27_17_17_27)] +%endif + + mov hd, r7m + mov grain_lutq, grain_lutmp +%%loop_y_v_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r0, [rsp+mmsize*8+gprsize*1] ; top_offxy + movu m5, [grain_lutq+r0*2] +%else + movu m5, [grain_lutq+top_offxyq*2] +%endif + punpckhwd m7, m5, m3 + punpcklwd m5, m3 ; {top/cur interleaved} + REPX {pmaddwd x, m2}, m7, m5 +%if %1 +%if ARCH_X86_32 + mov r5, r5m +%endif + REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5 +%else + REPX {paddd x, m14}, m7, m5 +%endif + REPX {psrad x, 5}, m7, m5 + packssdw m3, m5, m7 + pmaxsw m3, m8 + pminsw m3, m9 + + ; grain = grain_lut[offy+y][offx+x] + movu m4, [grain_lutq+offxyq*2+16] +%if ARCH_X86_32 + movu m5, [grain_lutq+r0*2+16] +%else + movu m5, [grain_lutq+top_offxyq*2+16] +%endif + punpckhwd m7, m5, m4 + punpcklwd m5, m4 ; {top/cur interleaved} + REPX {pmaddwd x, m2}, m7, m5 +%if %1 + REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5 +%else + REPX {paddd x, m14}, m7, m5 +%endif + REPX {psrad x, 5}, m7, m5 + packssdw m4, m5, m7 + pmaxsw m4, m8 + pminsw m4, m9 + + ; src + mova m0, [srcq] + mova m1, [srcq+16] + + ; luma_src + pxor mzero, mzero +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut + + mov lumaq, r9mp +%endif + mova m5, [lumaq+ 0] + mova m6, [lumaq+(16<<%2)] +%if %2 + phaddw m5, [lumaq+16] + phaddw m6, [lumaq+48] +%endif +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif +%if %2 + pavgw m5, mzero + pavgw m6, mzero +%endif + +%if %1 + punpckhwd m7, m5, m0 + punpcklwd m5, m0 + REPX {pmaddwd x, m14}, m7, m5 + REPX {psrad x, 6}, m7, m5 + packssdw m5, m7 + punpckhwd m7, m6, m1 + punpcklwd m6, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m7, m6 + REPX {psrad x, 6}, m7, m6 + packssdw m6, m7 + pxor mzero, mzero + REPX {paddw x, m15}, m5, m6 + REPX {pmaxsw x, mzero}, m5, m6 + REPX {pminsw x, m10}, m5, m6 ; clip_pixel() +%else + REPX {pand x, m10}, m5, m6 +%endif + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m5, scalingq-1, r0, r5, 8, 1 + vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1 +%else + vpgatherdw m7, m5, scalingq-1, r10, r12, 8, 1 + vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1 +%endif + REPX {psrlw x, 8}, m7, m5 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + REPX {pmullw x, m11}, m7, m5 + pmulhrsw m3, m7 + pmulhrsw m4, m5 + + ; dst = clip_pixel(src, noise) + paddw m0, m3 + paddw m1, m4 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+ 0], m0 + mova [dstq+16], m1 + + dec hw + jle %%end_y_v_overlap +%if ARCH_X86_32 + add srcq, r2mp + add dstq, r2mp + mov dstmp, dstq +%else + add srcq, r13mp + add dstq, r13mp + add lumaq, lstrideq +%endif + add grain_lutq, 82*2 +%if %3 + jmp %%loop_y +%else + btc hd, 16 
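+ ; btc toggles bit 16 of hd: carry clear on the first overlap row (reload
+ ; the second weight pair below and run one more overlapped row), carry
+ ; set on the second row, which falls back into the plain %%loop_y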
+ jc %%loop_y +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4] + jmp %%loop_y_v_overlap +%endif + +%%end_y_v_overlap: +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut + + mov wq, r4m +%endif + add wq, 16 + jge %%end_hv +%if ARCH_X86_32 + mov srcq, r1mp +%else + mov srcq, r10mp +%endif + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] +%if ARCH_X86_32 + mov r0mp, dstq + mov r9mp, lumaq + mov r4m, wq +%endif + +%if %2 + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap +%else + btc dword r8m, 2 + jc %%loop_x_hv_overlap + add offxyd, 16 +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r11d, 16 +%endif + jmp %%loop_x_odd_v_overlap +%endif + +%%loop_x_hv_overlap: +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, w, picptr, grain_lut + + mov t0d, [rsp+mmsize*8+gprsize*1] ; top_offxy + add offxyd, 16 + add t0d, 16 + mov [rsp+mmsize*8+gprsize*0], offxyd ; left_offxyd + mov [rsp+mmsize*8+gprsize*2], t0d ; topleft_offxyd + + DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut + + mov seed, r3m + xor t0d, t0d +%else + ; we assume from the block above that bits 8-15 of r7d are zero'ed +%endif + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp t0b ; parity of top_seed + shr seed, 16 + shl t0d, 16 + test seeb, seeh + setp t0b ; parity of cur_seed + or r6d, 0x00010001 + xor t0d, r6d + mov seed, t0d + ror seed, 1 ; updated (cur_seed << 16) | top_seed +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride + + lea topleft_offxyq, [top_offxyq+16] + lea left_offxyq, [offyq+16] + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, top_offxy +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride +%endif + movzx top_offxyd, offxyw +%if ARCH_X86_32 + mov [rsp+8*mmsize+1*gprsize], top_offxyd + + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + shr offxyd, 16 + +%if %3 == 0 +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m2, [PIC_ptr(pw_27_17_17_27)] +%endif + + mov hd, r7m + mov grain_lutq, grain_lutmp +%%loop_y_hv_overlap: + ; grain = grain_lut[offy+y][offx+x] +%if ARCH_X86_32 + mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy + mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy + movd m5, [grain_lutq+r5*2] +%else + movd m5, [grain_lutq+left_offxyq*2] +%endif + movu m7, [grain_lutq+offxyq*2] +%if ARCH_X86_32 + mov r5, [rsp+8*mmsize+2*gprsize] + movu m4, [grain_lutq+r0*2] +%if %2 + pinsrw m5, [grain_lutq+r5*2], 2 +%else + movd m3, [grain_lutq+r5*2] +%endif +%else + movu m4, [grain_lutq+top_offxyq*2] +%if %2 + pinsrw m5, [grain_lutq+topleft_offxyq*2], 2 ; { left, _, top/left } +%else + movd m3, [grain_lutq+topleft_offxyq*2] +%endif +%endif +%if %2 == 0 + punpckldq m5, m3 +%endif + punpckldq m3, m7, m4 ; { cur0/1,top0/1,cur2/3,top2/3 } + punpcklwd m5, m3 ; { 
left/cur0,_/cur1,topleft/top0,_/top1 } +%if %1 +%if ARCH_X86_32 + mov r5, r5m +%endif +%if %2 + movddup m0, [PIC_ptr(pw_23_22)] +%else + movddup m0, [PIC_ptr(pw_27_17_17_27)] +%endif +%else + pshufd m0, m15, q1010 +%endif + pmaddwd m5, m0 +%if %1 + paddd m5, [PIC_ptr(pd_16)] +%else + paddd m5, m14 +%endif + psrad m5, 5 + packssdw m5, m5 + pmaxsw m5, m8 + pminsw m5, m9 + shufps m5, m3, q3210 ; cur0/1,top0/1,cur2/3,top2/3 + shufps m3, m5, m7, q3220 ; cur0-7 post-h_filter + shufps m5, m4, q3231 ; top0-7 post-h_filter + + punpckhwd m7, m5, m3 + punpcklwd m5, m3 ; {top/cur interleaved} + REPX {pmaddwd x, m2}, m7, m5 +%if %1 + REPX {paddd x, [PIC_ptr(pd_16)]}, m5, m7 +%else + REPX {paddd x, m14}, m5, m7 +%endif + REPX {psrad x, 5}, m5, m7 + packssdw m3, m5, m7 + pmaxsw m3, m8 + pminsw m3, m9 + + ; right half + movu m4, [grain_lutq+offxyq*2+16] +%if ARCH_X86_32 + movu m0, [grain_lutq+r0*2+16] +%else + movu m0, [grain_lutq+top_offxyq*2+16] +%endif + punpckhwd m1, m0, m4 + punpcklwd m0, m4 ; {top/cur interleaved} + REPX {pmaddwd x, m2}, m1, m0 +%if %1 + REPX {paddd x, [PIC_ptr(pd_16)]}, m1, m0 +%else + REPX {paddd x, m14}, m1, m0 +%endif + REPX {psrad x, 5}, m1, m0 + packssdw m4, m0, m1 + pmaxsw m4, m8 + pminsw m4, m9 + + ; src + mova m0, [srcq] + mova m1, [srcq+16] + + ; luma_src + pxor mzero, mzero +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut + + mov lumaq, r9mp +%endif + mova m6, [lumaq+ 0] + mova m5, [lumaq+(16<<%2)] +%if %2 + phaddw m6, [lumaq+16] + phaddw m5, [lumaq+48] +%endif +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif +%if %2 + pavgw m6, mzero + pavgw m5, mzero +%endif + +%if %1 + punpckhwd m7, m6, m0 + punpcklwd m6, m0 + REPX {pmaddwd x, m14}, m7, m6 + REPX {psrad x, 6}, m7, m6 + packssdw m6, m7 + punpckhwd m7, m5, m1 + punpcklwd m5, m1 ; { luma, chroma } + REPX {pmaddwd x, m14}, m7, m5 + REPX {psrad x, 6}, m7, m5 + packssdw m5, m7 + pxor mzero, mzero + REPX {paddw x, m15}, m6, m5 + REPX {pmaxsw x, mzero}, m6, m5 + REPX {pminsw x, m10}, m6, m5 ; clip_pixel() +%else + REPX {pand x, m10}, m6, m5 +%endif + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m6, scalingq-1, r0, r5, 8, 1 + vpgatherdw m6, m5, scalingq-1, r0, r5, 8, 1 +%else +%if %3 == 0 + ; register shortage :) + push r12 +%endif + vpgatherdw m7, m6, scalingq-1, r2, r12, 8, 1 + vpgatherdw m6, m5, scalingq-1, r2, r12, 8, 1 +%if %3 == 0 + pop r12 +%endif +%endif + REPX {psrlw x, 8}, m7, m6 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + REPX {pmullw x, m11}, m7, m6 + pmulhrsw m3, m7 + pmulhrsw m4, m6 + + ; dst = clip_pixel(src, noise) + paddw m0, m3 + paddw m1, m4 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + movifnidn dstq, dstmp + mova [dstq+ 0], m0 + mova [dstq+16], m1 + +%if ARCH_X86_32 + add srcq, r2mp + add dstq, r2mp + mov dstmp, dstq +%else + add srcq, r13mp + add dstq, r13mp + add lumaq, lstrideq +%endif + add grain_lutq, 82*2 + dec hw +%if %3 + jg %%loop_y_h_overlap +%else + jle %%end_y_hv_overlap + btc hd, 16 + jc %%loop_y_h_overlap +%if ARCH_X86_32 + mov r5, r5m +%endif + SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4] + jmp %%loop_y_hv_overlap +%%end_y_hv_overlap: +%endif +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut + + mov wq, r4m +%endif + add wq, 16 + jge %%end_hv +%if ARCH_X86_32 + mov srcq, r1mp +%else + mov srcq, r10mp +%endif + mov dstq, r11mp + mov lumaq, r12mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] +%if ARCH_X86_32 + mov dstmp, dstq + mov r9mp, lumaq + 
mov r4m, wq +%endif +%if %2 + jmp %%loop_x_hv_overlap +%else + or dword r8m, 4 + add offxyd, 16 +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r11d, 16 ; top_offxy += 16 +%endif + jmp %%loop_x_odd_v_overlap +%endif + +%%end_hv: + RET +%endmacro + + %%FGUV_32x32xN_LOOP 1, %2, %3 +.csfl: + %%FGUV_32x32xN_LOOP 0, %2, %3 + +%if STACK_ALIGNMENT < mmsize +DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 +%endif +%endmacro + +FGUV_FN 420, 1, 1 +FGUV_FN 422, 1, 0 +FGUV_FN 444, 0, 0 diff --git a/third_party/dav1d/src/x86/filmgrain_avx2.asm b/third_party/dav1d/src/x86/filmgrain_avx2.asm new file mode 100644 index 0000000000..55445cf593 --- /dev/null +++ b/third_party/dav1d/src/x86/filmgrain_avx2.asm @@ -0,0 +1,2107 @@ +; Copyright © 2019-2022, VideoLAN and dav1d authors +; Copyright © 2019-2022, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" +%include "x86/filmgrain_common.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 +pb_mask: db 0,128,128, 0,128, 0, 0,128,128, 0, 0,128, 0,128,128, 0 +gen_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +gen_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 +gen_shufB: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11 +gen_shufC: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 +gen_shufD: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15 +; note: the order of (some of) the following constants matter +pb_27_17: times 2 db 27, 17 +byte_blend: db 0, 0, 0, -1 +pb_27_17_17_27: db 27, 17, 17, 27, 0, 32, 0, 32 +pb_17_27: times 2 db 17, 27 +pb_1: times 4 db 1 +pb_23_22: db 23, 22, 0, 32, 0, 32, 0, 32 +next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 +pw_seed_xor: times 2 dw 0xb524 + times 2 dw 0x49d8 +fg_min: times 4 db 0 + times 4 db 16 +fg_max: times 4 db 255 + times 4 db 240 + times 4 db 235 +pd_m65536: dd -65536 +pw_8: times 2 dw 8 +pw_1024: times 2 dw 1024 +hmul_bits: dw 32768, 16384, 8192, 4096 +round: dw 2048, 1024, 512 +mul_bits: dw 256, 128, 64, 32, 16 +round_vals: dw 32, 64, 128, 256, 512 +pw_1: dw 1 + +%macro JMP_TABLE 2-* + %1_8bpc_%2_table: + %xdefine %%base %1_8bpc_%2_table + %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) + %rep %0 - 2 + dd %%prefix %+ .ar%3 - %%base + %rotate 1 + %endrep +%endmacro + +JMP_TABLE generate_grain_y, avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_420, avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_422, avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_444, avx2, 0, 1, 2, 3 + +SECTION .text + +INIT_YMM avx2 +cglobal generate_grain_y_8bpc, 2, 9, 8, buf, fg_data +%define base r4-generate_grain_y_8bpc_avx2_table + lea r4, [generate_grain_y_8bpc_avx2_table] + vpbroadcastw xm0, [fg_dataq+FGData.seed] + mov r6d, [fg_dataq+FGData.grain_scale_shift] + movq xm1, [base+next_upperbit_mask] + movsxd r5, [fg_dataq+FGData.ar_coeff_lag] + movq xm4, [base+mul_bits] + movq xm5, [base+hmul_bits] + mov r7, -73*82 + mova xm6, [base+pb_mask] + sub bufq, r7 + vpbroadcastw xm7, [base+round+r6*2] + lea r6, [gaussian_sequence] + movsxd r5, [r4+r5*4] +.loop: + pand xm2, xm0, xm1 + psrlw xm3, xm2, 10 + por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw xm2, xm4 ; bits 0x0f00 are set + pmulhuw xm0, xm5 + pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds + psllq xm2, xm3, 30 + por xm2, xm3 + psllq xm3, xm2, 15 + por xm2, xm0 ; aggregate each bit into next seed's high bit + por xm3, xm2 ; 4 next output seeds + pshuflw xm0, xm3, q3333 + psrlw xm3, 5 + pand xm2, xm0, xm1 + movq r2, xm3 + psrlw xm3, xm2, 10 + por xm2, xm3 + pmullw xm2, xm4 + pmulhuw xm0, xm5 + movzx r3d, r2w + pshufb xm3, xm6, xm2 + psllq xm2, xm3, 30 + por xm2, xm3 + psllq xm3, xm2, 15 + por xm0, xm2 + movd xm2, [r6+r3*2] + rorx r3, r2, 32 + por xm3, xm0 + shr r2d, 16 + pinsrw xm2, [r6+r2*2], 1 + pshuflw xm0, xm3, q3333 + movzx r2d, r3w + psrlw xm3, 5 + pinsrw xm2, [r6+r2*2], 2 + shr r3d, 16 + movq r2, xm3 + pinsrw xm2, [r6+r3*2], 3 + movzx r3d, r2w + pinsrw xm2, [r6+r3*2], 4 + rorx r3, r2, 32 + shr r2d, 16 + pinsrw xm2, [r6+r2*2], 5 + movzx r2d, r3w + pinsrw xm2, [r6+r2*2], 6 + shr r3d, 16 + pinsrw xm2, [r6+r3*2], 7 + pmulhrsw xm2, xm7 + packsswb xm2, xm2 + movq [bufq+r7], xm2 + add r7, 8 + jl .loop + + ; auto-regression code + add r5, r4 + jmp r5 + +.ar1: + DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movsx cf3d, byte 
[fg_dataq+FGData.ar_coeffs_y+3] + movd xm5, [fg_dataq+FGData.ar_coeffs_y] + mova xm2, [base+gen_shufC] + DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0 + pinsrb xm5, [base+pb_1], 3 + vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd + pmovsxbw xm5, xm5 + pshufd xm4, xm5, q0000 + pshufd xm5, xm5, q1111 + sub bufq, 82*73-(82*3+79) + mov hd, 70 + mov mind, -128 + mov maxd, 127 +.y_loop_ar1: + mov xq, -76 + movsx val3d, byte [bufq+xq-1] +.x_loop_ar1: + pmovsxbw xm1, [bufq+xq-82-3] + pshufb xm0, xm1, xm2 + punpckhwd xm1, xm3 + pmaddwd xm0, xm4 + pmaddwd xm1, xm5 + paddd xm0, xm1 +.x_loop_ar1_inner: + movd val0d, xm0 + psrldq xm0, 4 + imul val3d, cf3d + add val3d, val0d + movsx val0d, byte [bufq+xq] + sarx val3d, val3d, shiftd + add val3d, val0d + cmp val3d, maxd + cmovns val3d, maxd + cmp val3d, mind + cmovs val3d, mind + mov [bufq+xq], val3b + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xb, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 +.x_loop_ar1_end: + add bufq, 82 + dec hd + jg .y_loop_ar1 +.ar0: + RET + +.ar2: +%if WIN64 + ; xmm6 and xmm7 already saved + %assign xmm_regs_used 16 + %assign stack_size_padded 168 + SUB rsp, stack_size_padded + movaps [rsp+16*2], xmm8 + movaps [rsp+16*3], xmm9 + movaps [rsp+16*4], xmm10 + movaps [rsp+16*5], xmm11 + movaps [rsp+16*6], xmm12 + movaps [rsp+16*7], xmm13 + movaps [rsp+16*8], xmm14 + movaps [rsp+16*9], xmm15 +%endif + DEFINE_ARGS buf, fg_data, h, x + mov r6d, [fg_dataq+FGData.ar_coeff_shift] + pmovsxbw xm7, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7 + movd xm9, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11 + vpbroadcastd xm10, [base+round_vals-14+r6*2] + movd xm11, [base+byte_blend+1] + pmovsxbw xm9, xm9 + pshufd xm4, xm7, q0000 + mova xm12, [base+gen_shufA] + pshufd xm5, xm7, q3333 + mova xm13, [base+gen_shufB] + pshufd xm6, xm7, q1111 + mova xm14, [base+gen_shufC] + pshufd xm7, xm7, q2222 + mova xm15, [base+gen_shufD] + pshufd xm8, xm9, q0000 + psrld xm10, 16 + pshufd xm9, xm9, q1111 + sub bufq, 82*73-(82*3+79) + mov hd, 70 +.y_loop_ar2: + mov xq, -76 +.x_loop_ar2: + pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] + pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] + pshufb xm2, xm0, xm12 + pmaddwd xm2, xm4 + pshufb xm3, xm1, xm13 + pmaddwd xm3, xm5 + paddd xm2, xm3 + pshufb xm3, xm0, xm14 + pmaddwd xm3, xm6 + punpckhqdq xm0, xm0 + punpcklwd xm0, xm1 + pmaddwd xm0, xm7 + pshufb xm1, xm15 + pmaddwd xm1, xm8 + paddd xm2, xm10 + paddd xm2, xm3 + paddd xm0, xm1 + paddd xm2, xm0 + movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5] +.x_loop_ar2_inner: + pmovsxbw xm1, xm0 + pmaddwd xm3, xm9, xm1 + psrldq xm1, 4 ; y=0,x=0 + paddd xm3, xm2 + psrldq xm2, 4 ; shift top to next pixel + psrad xm3, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw since we only care about one value + paddw xm3, xm1 + packsswb xm3, xm3 + pextrb [bufq+xq], xm3, 0 + pslldq xm3, 2 + vpblendvb xm0, xm3, xm11 + psrldq xm0, 1 + inc xq + jz .x_loop_ar2_end + test xb, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 +.x_loop_ar2_end: + add bufq, 82 + dec hd + jg .y_loop_ar2 + RET + +INIT_YMM avx2 +.ar3: +%if WIN64 + ; xmm6 and xmm7 already saved + %assign stack_offset 16 + ALLOC_STACK 16*14 + %assign stack_size stack_size - 16*4 + %assign xmm_regs_used 12 + movaps [rsp+16*12], xmm8 + movaps [rsp+16*13], xmm9 + movaps [rsp+16*14], xmm10 + movaps [rsp+16*15], xmm11 +%else + ALLOC_STACK 16*12 +%endif + mov r6d, [fg_dataq+FGData.ar_coeff_shift] + movd xm11, [base+byte_blend] + pmovsxbw m1, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15 + pmovsxbw xm2, 
[fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 + pshufd m0, m1, q0000 + mova [rsp+16* 0], m0 + pshufd m0, m1, q1111 + mova [rsp+16* 2], m0 + pshufd m0, m1, q2222 + mova [rsp+16* 4], m0 + pshufd m1, m1, q3333 + mova [rsp+16* 6], m1 + pshufd xm0, xm2, q0000 + mova [rsp+16* 8], xm0 + pshufd xm0, xm2, q1111 + mova [rsp+16* 9], xm0 + psrldq xm7, xm2, 10 + mova m8, [base+gen_shufA] + pinsrw xm2, [base+pw_1], 5 + mova m9, [base+gen_shufC] + pshufd xm2, xm2, q2222 + movu m10, [base+gen_shufE] + vpbroadcastw xm6, [base+round_vals-12+r6*2] + pinsrw xm7, [base+round_vals+r6*2-10], 3 + mova [rsp+16*10], xm2 + DEFINE_ARGS buf, fg_data, h, x + sub bufq, 82*73-(82*3+79) + mov hd, 70 +.y_loop_ar3: + mov xq, -76 +.x_loop_ar3: + movu xm5, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] + vinserti128 m5, [bufq+xq-82*2-3], 1 ; y=-2,x=[-3,+12] + movu xm4, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] + punpcklbw m3, m5, m5 + punpckhwd m5, m4 + psraw m3, 8 + punpcklbw m5, m5 + psraw m5, 8 + punpcklbw xm4, xm4 + psraw xm4, 8 + pshufb m0, m3, m8 + pmaddwd m0, [rsp+16*0] + pshufb m1, m3, m9 + pmaddwd m1, [rsp+16*2] + shufps m2, m3, m5, q1032 + paddd m0, m1 + pshufb m1, m2, m8 + vperm2i128 m3, m4, 0x21 + pmaddwd m1, [rsp+16*4] + shufps xm2, xm3, q1021 + vpblendd m2, m3, 0xf0 + pshufb m2, m10 + paddd m0, m1 + pmaddwd m2, [rsp+16*6] + pshufb xm1, xm4, xm9 + pmaddwd xm1, [rsp+16*8] + shufps xm4, xm5, q1132 + paddd m0, m2 + pshufb xm2, xm4, xm8 + pshufd xm4, xm4, q2121 + pmaddwd xm2, [rsp+16*9] + punpcklwd xm4, xm6 + pmaddwd xm4, [rsp+16*10] + vextracti128 xm3, m0, 1 + paddd xm0, xm1 + movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4] + paddd xm2, xm4 + paddd xm0, xm2 + paddd xm0, xm3 +.x_loop_ar3_inner: + pmovsxbw xm2, xm1 + pmaddwd xm2, xm7 + pshufd xm3, xm2, q1111 + paddd xm2, xm0 ; add top + paddd xm2, xm3 ; left+cur + psrldq xm0, 4 + psrad xm2, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw since we only care about one value + packsswb xm2, xm2 + pextrb [bufq+xq], xm2, 0 + pslldq xm2, 3 + vpblendvb xm1, xm2, xm11 + psrldq xm1, 1 + inc xq + jz .x_loop_ar3_end + test xb, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 +.x_loop_ar3_end: + add bufq, 82 + dec hd + jg .y_loop_ar3 + RET + +%macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y +INIT_XMM avx2 +cglobal generate_grain_uv_%1_8bpc, 4, 10, 16, buf, bufy, fg_data, uv +%define base r4-generate_grain_uv_%1_8bpc_avx2_table + lea r4, [generate_grain_uv_%1_8bpc_avx2_table] + vpbroadcastw xm0, [fg_dataq+FGData.seed] + mov r6d, [fg_dataq+FGData.grain_scale_shift] + movq xm1, [base+next_upperbit_mask] + movq xm4, [base+mul_bits] + movq xm5, [base+hmul_bits] + mova xm6, [base+pb_mask] + vpbroadcastw xm7, [base+round+r6*2] + vpbroadcastd xm2, [base+pw_seed_xor+uvq*4] + pxor xm0, xm2 + lea r6, [gaussian_sequence] +%if %2 + mov r7d, 73-35*%3 + add bufq, 44 +.loop_y: + mov r5, -44 +%else + mov r5, -73*82 + sub bufq, r5 +%endif +.loop: + pand xm2, xm0, xm1 + psrlw xm3, xm2, 10 + por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw xm2, xm4 ; bits 0x0f00 are set + pmulhuw xm0, xm5 + pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds + psllq xm2, xm3, 30 + por xm2, xm3 + psllq xm3, xm2, 15 + por xm2, xm0 ; aggregate each bit into next seed's high bit + por xm2, xm3 ; 4 next output seeds + pshuflw xm0, xm2, q3333 + psrlw xm2, 5 + movq r8, xm2 + movzx r9d, r8w + movd xm2, [r6+r9*2] + rorx r9, r8, 32 + shr r8d, 16 + pinsrw xm2, [r6+r8*2], 1 + movzx r8d, r9w + pinsrw xm2, [r6+r8*2], 2 + shr r9d, 16 + pinsrw xm2, [r6+r9*2], 3 + pmulhrsw xm2, xm7 + packsswb xm2, xm2 + movd [bufq+r5], xm2 + add r5, 4 + jl .loop +%if %2 
+ add bufq, 82 + dec r7d + jg .loop_y +%endif + + ; auto-regression code + movsxd r6, [fg_dataq+FGData.ar_coeff_lag] + movsxd r6, [base+generate_grain_uv_%1_8bpc_avx2_table+r6*4] + add r6, r4 + jmp r6 + +INIT_YMM avx2 +.ar0: + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq] + movd xm3, [base+hmul_bits+shiftq*2] + DEFINE_ARGS buf, bufy, h + pmovsxbw xm2, xm2 +%if %2 + vpbroadcastd m7, [base+pb_1] + vpbroadcastw m6, [base+hmul_bits+2+%3*2] +%endif + vpbroadcastw m2, xm2 + vpbroadcastw m3, xm3 + pxor m12, m12 +%if %2 + sub bufq, 82*(73-35*%3)+82-(82*3+41) +%else + sub bufq, 82*70-3 +%endif + add bufyq, 3+82*3 + mov hd, 70-35*%3 +.y_loop_ar0: +%if %2 + ; first 32 pixels + movu xm4, [bufyq] + vinserti128 m4, [bufyq+32], 1 +%if %3 + movu xm0, [bufyq+82] + vinserti128 m0, [bufyq+82+32], 1 +%endif + movu xm5, [bufyq+16] + vinserti128 m5, [bufyq+48], 1 +%if %3 + movu xm1, [bufyq+82+16] + vinserti128 m1, [bufyq+82+48], 1 +%endif + pmaddubsw m4, m7, m4 +%if %3 + pmaddubsw m0, m7, m0 +%endif + pmaddubsw m5, m7, m5 +%if %3 + pmaddubsw m1, m7, m1 + paddw m4, m0 + paddw m5, m1 +%endif + pmulhrsw m4, m6 + pmulhrsw m5, m6 +%else + xor r3d, r3d + ; first 32x2 pixels +.x_loop_ar0: + movu m4, [bufyq+r3] + pcmpgtb m0, m12, m4 + punpckhbw m5, m4, m0 + punpcklbw m4, m0 +%endif + pmullw m4, m2 + pmullw m5, m2 + pmulhrsw m4, m3 + pmulhrsw m5, m3 +%if %2 + movu m1, [bufq] +%else + movu m1, [bufq+r3] +%endif + pcmpgtb m8, m12, m1 + punpcklbw m0, m1, m8 + punpckhbw m1, m8 + paddw m0, m4 + paddw m1, m5 + packsswb m0, m1 +%if %2 + movu [bufq], m0 +%else + movu [bufq+r3], m0 + add r3d, 32 + cmp r3d, 64 + jl .x_loop_ar0 +%endif + + ; last 6/12 pixels + movu xm4, [bufyq+32*2] +%if %2 +%if %3 + movu xm5, [bufyq+32*2+82] +%endif + pmaddubsw xm4, xm7, xm4 +%if %3 + pmaddubsw xm5, xm7, xm5 + paddw xm4, xm5 +%endif + movq xm0, [bufq+32] + pmulhrsw xm4, xm6 + pmullw xm4, xm2 + pmulhrsw xm4, xm3 + pcmpgtb xm5, xm12, xm0 + punpcklbw xm5, xm0, xm5 + paddw xm4, xm5 + packsswb xm4, xm4 + pblendw xm0, xm4, xm0, 1000b + movq [bufq+32], xm0 +%else + movu xm0, [bufq+64] + pcmpgtb xm1, xm12, xm4 + punpckhbw xm5, xm4, xm1 + punpcklbw xm4, xm1 + pmullw xm5, xm2 + pmullw xm4, xm2 + vpblendd xm1, xm3, xm12, 0x0c + pmulhrsw xm5, xm1 + pmulhrsw xm4, xm3 + pcmpgtb xm1, xm12, xm0 + punpckhbw xm8, xm0, xm1 + punpcklbw xm0, xm1 + paddw xm5, xm8 + paddw xm0, xm4 + packsswb xm0, xm5 + movu [bufq+64], xm0 +%endif + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar0 + RET + +INIT_XMM avx2 +.ar1: + DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] + movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] + pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3 + DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift + pmovsxbw xm4, xm4 + pshufd xm5, xm4, q1111 + pshufd xm4, xm4, q0000 + pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd +%if %2 + vpbroadcastd xm7, [base+pb_1] + vpbroadcastw xm6, [base+hmul_bits+2+%3*2] +%endif + vpbroadcastd xm3, xm3 +%if %2 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*70-(82-3) +%endif + add bufyq, 79+82*3 + mov hd, 70-35*%3 + mov mind, -128 + mov maxd, 127 +.y_loop_ar1: + mov xq, -(76>>%2) + movsx val3d, byte [bufq+xq-1] +.x_loop_ar1: + pmovsxbw xm0, [bufq+xq-82-1] ; top/left +%if %2 + movq xm8, [bufyq+xq*2] +%if %3 + movq xm9, [bufyq+xq*2+82] +%endif +%endif + 
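+ ; The AR(1) recurrence computed here, as a C-style sketch (names
+ ; illustrative, not the exact reference code):
+ ;   sum = c0*buf[y-1][x-1] + c1*buf[y-1][x] + c2*buf[y-1][x+1]
+ ;       + c3*buf[y][x-1] + c_luma*luma;
+ ;   buf[y][x] = iclip(buf[y][x] + round2(sum, ar_coeff_shift), -128, 127);
+ ; The top-row and luma terms are vectorized four pixels at a time below;
+ ; the serial dependency on the left neighbour (cf3) is resolved one
+ ; pixel at a time in .x_loop_ar1_inner.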
psrldq xm2, xm0, 2 ; top + psrldq xm1, xm0, 4 ; top/right +%if %2 + pmaddubsw xm8, xm7, xm8 +%if %3 + pmaddubsw xm9, xm7, xm9 + paddw xm8, xm9 +%endif + pmulhrsw xm8, xm6 +%else + pmovsxbw xm8, [bufyq+xq] +%endif + punpcklwd xm0, xm2 + punpcklwd xm1, xm8 + pmaddwd xm0, xm4 + pmaddwd xm1, xm5 + paddd xm0, xm1 + paddd xm0, xm3 +.x_loop_ar1_inner: + movd val0d, xm0 + psrldq xm0, 4 + imul val3d, cf3d + add val3d, val0d + sarx val3d, val3d, shiftd + movsx val0d, byte [bufq+xq] + add val3d, val0d + cmp val3d, maxd + cmovns val3d, maxd + cmp val3d, mind + cmovs val3d, mind + mov byte [bufq+xq], val3b + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar1 + RET + +.ar2: + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + vpbroadcastw xm13, [base+round_vals-12+shiftq*2] + pmovsxbw xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7 + pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12 + pinsrw xm0, [base+pw_1], 5 +%if %2 + vpbroadcastw xm12, [base+hmul_bits+2+%3*2] + vpbroadcastd xm11, [base+pb_1] +%endif + DEFINE_ARGS buf, bufy, fg_data, h, unused, x + pshufd xm4, xm7, q0000 + pshufd xm5, xm7, q3333 + pshufd xm6, xm7, q1111 + pshufd xm7, xm7, q2222 + pshufd xm8, xm0, q0000 + pshufd xm9, xm0, q1111 + pshufd xm10, xm0, q2222 +%if %2 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*70-(82-3) +%endif + add bufyq, 79+82*3 + mov hd, 70-35*%3 +.y_loop_ar2: + mov xq, -(76>>%2) + +.x_loop_ar2: + pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] + pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] + pshufb xm2, xm0, [base+gen_shufA] + pmaddwd xm2, xm4 + pshufb xm3, xm1, [base+gen_shufB] + pmaddwd xm3, xm5 + paddd xm2, xm3 + pshufb xm3, xm0, [base+gen_shufC] + pmaddwd xm3, xm6 + punpckhqdq xm0, xm0 ; y=-2,x=[+2,+5] + punpcklwd xm0, xm1 + pmaddwd xm0, xm7 + pshufb xm1, [gen_shufD] + pmaddwd xm1, xm8 + paddd xm2, xm3 + paddd xm0, xm1 + paddd xm2, xm0 + +%if %2 + movq xm0, [bufyq+xq*2] +%if %3 + movq xm3, [bufyq+xq*2+82] +%endif + pmaddubsw xm0, xm11, xm0 +%if %3 + pmaddubsw xm3, xm11, xm3 + paddw xm0, xm3 +%endif + pmulhrsw xm0, xm12 +%else + pmovsxbw xm0, [bufyq+xq] +%endif + punpcklwd xm0, xm13 + pmaddwd xm0, xm10 + paddd xm2, xm0 + + movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5] +.x_loop_ar2_inner: + pmovsxbw xm0, xm0 + pmaddwd xm3, xm0, xm9 + psrldq xm0, 2 + paddd xm3, xm2 + psrldq xm2, 4 ; shift top to next pixel + psrad xm3, [fg_dataq+FGData.ar_coeff_shift] + pslldq xm3, 2 + paddw xm3, xm0 + pblendw xm0, xm3, 00000010b + packsswb xm0, xm0 + pextrb [bufq+xq], xm0, 1 + inc xq + jz .x_loop_ar2_end + test xb, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar2 + RET + +INIT_YMM avx2 +.ar3: + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + pmovsxbw m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15 + pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23 + vpbroadcastb xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+24] ; cf24 [luma] + movd xm13, [base+round_vals-10+shiftq*2] + vpbroadcastd xm14, [base+round_vals-14+shiftq*2] + pshufd m6, m0, q0000 + pshufd m7, m0, q1111 + pshufd m8, m0, q2222 + pshufd m9, m0, q3333 + pshufd xm10, xm1, q0000 + pshufd xm11, xm1, q1111 + pshufhw xm12, xm1, q0000 + psraw xm2, 8 + palignr xm13, xm1, 10 + punpckhwd 
xm12, xm2 ; interleave luma cf + psrld xm14, 16 + DEFINE_ARGS buf, bufy, fg_data, h, unused, x +%if %2 + vpbroadcastw xm15, [base+hmul_bits+2+%3*2] + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*70-(82-3) +%endif + add bufyq, 79+82*3 + mov hd, 70-35*%3 +.y_loop_ar3: + mov xq, -(76>>%2) +.x_loop_ar3: + vbroadcasti128 m3, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12 + palignr xm1, xm3, [bufq+xq-82*3-9], 6 ; y=-3,x=[-3,+12] + vbroadcasti128 m4, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] + vpblendd m3, m1, 0x0f + pxor m0, m0 + pcmpgtb m2, m0, m3 + pcmpgtb m0, m4 + punpcklbw m1, m3, m2 + punpckhbw m3, m2 + punpcklbw m2, m4, m0 + punpckhbw xm4, xm0 + pshufb m0, m1, [base+gen_shufA] + pmaddwd m0, m6 + pshufb m5, m1, [base+gen_shufC] + pmaddwd m5, m7 + shufps m1, m3, q1032 + paddd m0, m5 + pshufb m5, m1, [base+gen_shufA] + pmaddwd m5, m8 + shufps xm1, xm3, q2121 + vpblendd m1, m2, 0xf0 + pshufb m1, [base+gen_shufE] + pmaddwd m1, m9 + paddd m0, m5 + pshufb xm3, xm2, [base+gen_shufC] + paddd m0, m1 + pmaddwd xm3, xm10 + palignr xm1, xm4, xm2, 2 + punpckhwd xm1, xm2, xm1 + pmaddwd xm1, xm11 + palignr xm4, xm2, 12 + paddd xm3, xm1 +%if %2 + vpbroadcastd xm5, [base+pb_1] + movq xm1, [bufyq+xq*2] + pmaddubsw xm1, xm5, xm1 +%if %3 + movq xm2, [bufyq+xq*2+82] + pmaddubsw xm5, xm2 + paddw xm1, xm5 +%endif + pmulhrsw xm1, xm15 +%else + pmovsxbw xm1, [bufyq+xq] +%endif + punpcklwd xm4, xm1 + pmaddwd xm4, xm12 + movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4] + vextracti128 xm2, m0, 1 + paddd xm0, xm14 + paddd xm3, xm4 + paddd xm0, xm3 + paddd xm0, xm2 +.x_loop_ar3_inner: + pmovsxbw xm1, xm1 + pmaddwd xm2, xm13, xm1 + pshuflw xm3, xm2, q1032 + paddd xm2, xm0 ; add top + paddd xm2, xm3 ; left+cur + psrldq xm0, 4 + psrad xm2, [fg_dataq+FGData.ar_coeff_shift] + psrldq xm1, 2 + ; don't packssdw, we only care about one value + punpckldq xm2, xm2 + pblendw xm1, xm2, 0100b + packsswb xm1, xm1 + pextrb [bufq+xq], xm1, 2 + inc xq + jz .x_loop_ar3_end + test xb, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 +.x_loop_ar3_end: + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar3 + RET +%endmacro + +INIT_YMM avx2 +cglobal fgy_32x32xn_8bpc, 6, 13, 15, dst, src, stride, fg_data, w, scaling, \ + grain_lut, h, sby, see, overlap +%define base r9-pd_m65536 + lea r9, [pd_m65536] + mov r6d, [fg_dataq+FGData.scaling_shift] + mov r7d, [fg_dataq+FGData.clip_to_restricted_range] + mov sbyd, sbym + mov overlapd, [fg_dataq+FGData.overlap_flag] + vpbroadcastd m8, [base+pd_m65536] + vpbroadcastw m9, [base+mul_bits+r6*2-14] + vpbroadcastd m10, [base+fg_min+r7*4] + vpbroadcastd m11, [base+fg_max+r7*8] + vpbroadcastd m12, [base+pw_1024] + movq xm13, [base+pb_27_17_17_27] + test sbyd, sbyd + setnz r7b + pxor m7, m7 + test r7b, overlapb + jnz .vertical_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rorx seed, seed, 24 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, overlap + + lea src_bakq, [srcq+wq] + neg wq + sub dstq, srcq + +.loop_x: + rorx r6, seeq, 1 + or seed, 0xEFF4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164 + lea offyd, [offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, overlap + + mov hd, hm + mov grain_lutq, grain_lutmp +.loop_y: + ; src + mova m2, [srcq] + punpcklbw m0, m2, m7 + punpckhbw m1, m2, m7 + + ; scaling[src] + pandn m4, m8, 
m0 + mova m6, m8 + vpgatherdd m2, [scalingq+m4-0], m8 + psrld m3, m0, 16 + mova m8, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pandn m5, m8, m1 + mova m6, m8 + vpgatherdd m3, [scalingq+m5-0], m8 + pblendw m2, m4, 0xaa + psrld m4, m1, 16 + mova m8, m6 + vpgatherdd m5, [scalingq+m4-2], m6 + pblendw m3, m5, 0xaa + + ; grain = grain_lut[offy+y][offx+x] + movu m5, [grain_lutq+offxyq] + punpcklbw m4, m5, m7 + punpckhbw m5, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmaddubsw m2, m4 + pmaddubsw m3, m5 + pmulhrsw m2, m9 + pmulhrsw m3, m9 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + pmaxub m0, m10 + pminub m0, m11 + mova [dstq+srcq], m0 + + add srcq, strideq + add grain_lutq, 82 + dec hd + jg .loop_y + + add wq, 32 + jge .end + lea srcq, [src_bakq+wq] + test overlapd, overlapd + jz .loop_x + + ; r8m = sbym + cmp dword r8m, 0 + jne .loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +.loop_x_h_overlap: + rorx r6, seeq, 1 + or seed, 0xEFF4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, left_offxy + + lea left_offxyd, [offyq+32] ; previous column's offy*stride+offx + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164 + lea offyd, [offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, left_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm +.loop_y_h_overlap: + ; src + mova m2, [srcq] + punpcklbw m0, m2, m7 + punpckhbw m1, m2, m7 + + ; scaling[src] + pandn m4, m8, m0 + mova m6, m8 + vpgatherdd m2, [scalingq+m4-0], m8 + psrld m3, m0, 16 + mova m8, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pandn m5, m8, m1 + mova m6, m8 + vpgatherdd m3, [scalingq+m5-0], m8 + pblendw m2, m4, 0xaa + psrld m4, m1, 16 + mova m8, m6 + vpgatherdd m5, [scalingq+m4-2], m6 + pblendw m3, m5, 0xaa + + ; grain = grain_lut[offy+y][offx+x] + movu m5, [grain_lutq+offxyq] + movd xm4, [grain_lutq+left_offxyq] + punpcklbw xm4, xm5 + pmaddubsw xm4, xm13, xm4 + pmulhrsw xm4, xm12 + packsswb xm4, xm4 + vpblendd m4, m5, 0xfe + punpckhbw m5, m7 + punpcklbw m4, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmaddubsw m2, m4 + pmaddubsw m3, m5 + pmulhrsw m2, m9 + pmulhrsw m3, m9 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + pmaxub m0, m10 + pminub m0, m11 + mova [dstq+srcq], m0 + + add srcq, strideq + add grain_lutq, 82 + dec hd + jg .loop_y_h_overlap + + add wq, 32 + jge .end + lea srcq, [src_bakq+wq] + + ; r8m = sbym + cmp dword r8m, 0 + jne .loop_x_hv_overlap + jmp .loop_x_h_overlap + +.vertical_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + unused, sby, see, overlap + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, overlap + + lea src_bakq, [srcq+wq] + neg wq + sub dstq, srcq + +.loop_x_v_overlap: + vpbroadcastd m14, [pb_27_17] + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp 
r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, overlap, top_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 +.loop_y_v_overlap: + ; src + mova m2, [srcq] + punpcklbw m0, m2, m7 + punpckhbw m1, m2, m7 + + ; scaling[src] + pandn m4, m8, m0 + mova m6, m8 + vpgatherdd m2, [scalingq+m4-0], m8 + psrld m3, m0, 16 + mova m8, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pandn m5, m8, m1 + mova m6, m8 + vpgatherdd m3, [scalingq+m5-0], m8 + pblendw m2, m4, 0xaa + psrld m4, m1, 16 + mova m8, m6 + vpgatherdd m5, [scalingq+m4-2], m6 + pblendw m3, m5, 0xaa + + ; grain = grain_lut[offy+y][offx+x] + movu m6, [grain_lutq+offxyq] + movu m4, [grain_lutq+top_offxyq] + punpcklbw m5, m4, m6 + punpckhbw m4, m6 + pmaddubsw m5, m14, m5 + pmaddubsw m4, m14, m4 + pmulhrsw m5, m12 + pmulhrsw m4, m12 + packsswb m5, m4 + punpcklbw m4, m5, m7 + punpckhbw m5, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmaddubsw m2, m4 + pmaddubsw m3, m5 + pmulhrsw m2, m9 + pmulhrsw m3, m9 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + pmaxub m0, m10 + pminub m0, m11 + mova [dstq+srcq], m0 + + add srcq, strideq + add grain_lutq, 82 + dec hb + jz .end_y_v_overlap + vpbroadcastd m14, [pb_17_27] ; swap weights for second v-overlap line + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + add hd, 0x80000000 + jnc .loop_y_v_overlap + jmp .loop_y +.end_y_v_overlap: + add wq, 32 + jge .end + lea srcq, [src_bakq+wq] + + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap +.loop_x_hv_overlap: + vpbroadcastd m14, [pb_27_17] + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy + + lea topleft_offxyd, [top_offxyq+32] + lea left_offxyd, [offyq+32] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 +.loop_y_hv_overlap: + ; src + mova m2, [srcq] + punpcklbw m0, m2, m7 + punpckhbw m1, m2, m7 + + ; scaling[src] + pandn m4, m8, m0 + mova m6, m8 + vpgatherdd m2, [scalingq+m4-0], m8 + psrld m3, m0, 16 + mova m8, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pandn m5, m8, m1 + mova m6, m8 + vpgatherdd m3, [scalingq+m5-0], m8 + pblendw m2, m4, 0xaa + psrld m4, m1, 16 + mova m8, m6 + vpgatherdd m5, [scalingq+m4-2], m6 + pblendw m3, m5, 0xaa + + ; grain = grain_lut[offy+y][offx+x] + movu m6, 
[grain_lutq+offxyq] + movd xm7, [grain_lutq+left_offxyq] + movu m4, [grain_lutq+top_offxyq] + movd xm5, [grain_lutq+topleft_offxyq] + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklbw xm7, xm6 + punpcklbw xm5, xm4 + pmaddubsw xm7, xm13, xm7 + pmaddubsw xm5, xm13, xm5 + pmulhrsw xm7, xm12 + pmulhrsw xm5, xm12 + packsswb xm7, xm7 + packsswb xm5, xm5 + vpblendd m7, m6, 0xfe + vpblendd m5, m4, 0xfe + ; followed by v interpolation (top | cur -> cur) + punpckhbw m4, m6 + punpcklbw m5, m7 + pmaddubsw m4, m14, m4 + pmaddubsw m5, m14, m5 + pmulhrsw m4, m12 + pmulhrsw m5, m12 + pxor m7, m7 + packsswb m5, m4 + punpcklbw m4, m5, m7 + punpckhbw m5, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmaddubsw m2, m4 + pmaddubsw m3, m5 + pmulhrsw m2, m9 + pmulhrsw m3, m9 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + pmaxub m0, m10 + pminub m0, m11 + mova [dstq+srcq], m0 + + add srcq, strideq + add grain_lutq, 82 + dec hb + jz .end_y_hv_overlap + vpbroadcastd m14, [pb_17_27] ; swap weights for second v-overlap line + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + add hd, 0x80000000 + jnc .loop_y_hv_overlap + jmp .loop_y_h_overlap +.end_y_hv_overlap: + add wq, 32 + lea srcq, [src_bakq+wq] + jl .loop_x_hv_overlap +.end: + RET + +%macro FGUV_FN 3 ; name, ss_hor, ss_ver +cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ + grain_lut, h, sby, luma, overlap, uv_pl, is_id +%define base r11-pd_m65536 + lea r11, [pd_m65536] + mov r6d, [fg_dataq+FGData.scaling_shift] + mov r7d, [fg_dataq+FGData.clip_to_restricted_range] + mov r9d, is_idm + mov sbyd, sbym + mov overlapd, [fg_dataq+FGData.overlap_flag] + vpbroadcastd m8, [base+pd_m65536] + vpbroadcastw m9, [base+mul_bits+r6*2-14] + vpbroadcastd m10, [base+fg_min+r7*4] + shlx r7d, r7d, r9d + vpbroadcastd m11, [base+fg_max+r7*4] + vpbroadcastd m12, [base+pw_1024] + pxor m7, m7 + test sbyd, sbyd + setnz r7b + cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 + jne .csfl + +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, sby, see, overlap, uv_pl +%if %1 + mov r6d, uv_plm + vpbroadcastd m0, [base+pw_8] + vbroadcasti128 m14, [fg_dataq+FGData.uv_mult+r6*4] + vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r6*4] + pshufb m14, m0 ; uv_luma_mult, uv_mult +%elif %2 + vpbroadcastq m15, [base+pb_23_22] +%else + vpbroadcastq xm15, [base+pb_27_17_17_27] +%endif +%if %3 + vpbroadcastw m13, [base+pb_23_22] +%elif %2 + pshufd m13, [base+pb_27_17], q0000 ; 8x27_17, 8x17_27 +%endif + test r7b, overlapb + jnz %%vertical_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rorx seed, seed, 24 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + unused2, unused3, see, overlap, unused4, unused5, lstride + + mov lumaq, r9mp + lea r12, [srcq+wq] + lea r13, [dstq+wq] + lea r14, [lumaq+wq*(1+%2)] + mov r11mp, r12 + mov r12mp, r13 + mov lstrideq, r10mp + neg wq + +%%loop_x: + rorx r6, seeq, 1 + or seed, 0xEFF4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, overlap, unused1, unused2, lstride + + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; 
offy*stride+offx + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, overlap, unused1, unused2, lstride + + mov grain_lutq, grain_lutmp + mov hd, hm +%%loop_y: + ; src +%if %2 + mova xm3, [lumaq+lstrideq*0+ 0] + vinserti128 m3, [lumaq+lstrideq*(1+%3) +0], 1 + vpbroadcastd m2, [pb_1] + mova xm0, [lumaq+lstrideq*0+16] + vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1 + mova xm1, [srcq] + vinserti128 m1, [srcq+strideq], 1 + pmaddubsw m3, m2 + pmaddubsw m0, m2 + pavgw m3, m7 + pavgw m0, m7 +%else + mova m2, [lumaq] + mova m1, [srcq] +%endif +%if %1 +%if %2 + packuswb m2, m3, m0 ; luma +%endif + punpckhbw m3, m2, m1 + punpcklbw m2, m1 ; { luma, chroma } + pmaddubsw m3, m14 + pmaddubsw m2, m14 + psraw m3, 6 + psraw m2, 6 + paddw m3, m15 + paddw m2, m15 + packuswb m2, m3 ; pack+unpack = clip +%endif +%if %1 || %2 == 0 + punpcklbw m3, m2, m7 + punpckhbw m0, m2, m7 +%endif + + ; scaling[luma_src] + pandn m4, m8, m3 + mova m6, m8 + vpgatherdd m2, [scalingq+m4-0], m8 + psrld m3, 16 + mova m8, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pandn m5, m8, m0 + mova m6, m8 + vpgatherdd m3, [scalingq+m5-0], m8 + psrld m0, 16 + mova m8, m6 + vpgatherdd m5, [scalingq+m0-2], m6 + pblendw m2, m4, 0xaa + pblendw m3, m5, 0xaa + + ; grain = grain_lut[offy+y][offx+x] +%if %2 + movu xm5, [grain_lutq+offxyq+ 0] + vinserti128 m5, [grain_lutq+offxyq+82], 1 +%else + movu m5, [grain_lutq+offxyq] +%endif + punpcklbw m4, m5, m7 + punpckhbw m5, m7 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmaddubsw m2, m4 + pmaddubsw m3, m5 + pmulhrsw m2, m9 + pmulhrsw m3, m9 + + ; unpack chroma_source + punpcklbw m0, m1, m7 + punpckhbw m1, m7 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + pmaxub m0, m10 + pminub m0, m11 +%if %2 + mova [dstq], xm0 + vextracti128 [dstq+strideq], m0, 1 +%else + mova [dstq], m0 +%endif + +%if %2 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82<<%2 + sub hb, 1+%2 + jg %%loop_y + + add wq, 32>>%2 + jge .end + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r14+wq*(1+%2)] + add srcq, wq + add dstq, wq + test overlapd, overlapd + jz %%loop_x + + ; r8m = sbym + cmp dword r8m, 0 + jne %%loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +%%loop_x_h_overlap: + rorx r6, seeq, 1 + or seed, 0xEFF4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, unused1, unused2, lstride + + lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, unused1, unused2, lstride + + mov grain_lutq, grain_lutmp + mov hd, hm +%%loop_y_h_overlap: + ; src +%if %2 + mova xm3, [lumaq+lstrideq*0+ 0] + vinserti128 m3, [lumaq+lstrideq*(1+%3)+ 0], 1 + vpbroadcastd m2, [pb_1] + mova xm0, [lumaq+lstrideq*0+16] + vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1 + mova xm1, [srcq] + vinserti128 m1, [srcq+strideq], 1 + pmaddubsw m3, m2 + pmaddubsw m0, m2 + pavgw m3, m7 + pavgw m0, m7 +%else + mova m2, [lumaq] + mova m1, [srcq] +%endif +%if %1 +%if %2 + packuswb m2, m3, m0 ; luma +%endif + punpckhbw m3, m2, m1 + punpcklbw m2, m1 ; 
{ luma, chroma } + pmaddubsw m3, m14 + pmaddubsw m2, m14 + psraw m3, 6 + psraw m2, 6 + paddw m3, m15 + paddw m2, m15 + packuswb m2, m3 ; pack+unpack = clip +%endif +%if %1 || %2 == 0 + punpcklbw m3, m2, m7 + punpckhbw m0, m2, m7 +%endif + + ; scaling[luma_src] + pandn m4, m8, m3 + mova m6, m8 + vpgatherdd m2, [scalingq+m4-0], m8 + psrld m3, 16 + mova m8, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pandn m5, m8, m0 + mova m6, m8 + vpgatherdd m3, [scalingq+m5-0], m8 + psrld m0, 16 + mova m8, m6 + vpgatherdd m5, [scalingq+m0-2], m6 + pblendw m2, m4, 0xaa + pblendw m3, m5, 0xaa + + ; grain = grain_lut[offy+y][offx+x] +%if %2 + movu xm5, [grain_lutq+offxyq+ 0] + vinserti128 m5, [grain_lutq+offxyq+82], 1 + movd xm4, [grain_lutq+left_offxyq+ 0] + vinserti128 m4, [grain_lutq+left_offxyq+82], 1 + punpcklbw m4, m5 +%if %1 + vpbroadcastq m0, [pb_23_22] + pmaddubsw m4, m0, m4 +%else + pmaddubsw m4, m15, m4 +%endif + pmulhrsw m4, m12 + packsswb m4, m4 + vpblendd m4, m5, 0xee +%else + movu m5, [grain_lutq+offxyq] + movd xm4, [grain_lutq+left_offxyq] + punpcklbw xm4, xm5 +%if %1 + movq xm0, [pb_27_17_17_27] + pmaddubsw xm4, xm0, xm4 +%else + pmaddubsw xm4, xm15, xm4 +%endif + pmulhrsw xm4, xm12 + packsswb xm4, xm4 + vpblendd m4, m5, 0xfe +%endif + punpckhbw m5, m7 + punpcklbw m4, m7 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmaddubsw m2, m4 + pmaddubsw m3, m5 + pmulhrsw m2, m9 + pmulhrsw m3, m9 + + ; unpack chroma_source + punpcklbw m0, m1, m7 + punpckhbw m1, m7 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + pmaxub m0, m10 + pminub m0, m11 +%if %2 + mova [dstq], xm0 + vextracti128 [dstq+strideq], m0, 1 +%else + mova [dstq], m0 +%endif + +%if %2 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82*(1+%2) + sub hb, 1+%2 + jg %%loop_y_h_overlap + + add wq, 32>>%2 + jge .end + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r14+wq*(1+%2)] + add srcq, wq + add dstq, wq + + ; r8m = sbym + cmp dword r8m, 0 + jne %%loop_x_hv_overlap + jmp %%loop_x_h_overlap + +%%vertical_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ + sby, see, overlap, unused1, unused2, lstride + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + unused1, unused2, see, overlap, unused3, unused4, lstride + + mov lumaq, r9mp + lea r12, [srcq+wq] + lea r13, [dstq+wq] + lea r14, [lumaq+wq*(1+%2)] + mov r11mp, r12 + mov r12mp, r13 + mov lstrideq, r10mp + neg wq + +%%loop_x_v_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, overlap, top_offxy, unused, lstride + + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea 
offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, overlap, top_offxy, unused, lstride + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 +%if %2 == 0 + vpbroadcastd m13, [pb_27_17] +%endif +%%loop_y_v_overlap: + ; src +%if %2 + mova xm3, [lumaq+lstrideq*0+ 0] + vinserti128 m3, [lumaq+lstrideq*(1+%3)+ 0], 1 + vpbroadcastd m2, [pb_1] + mova xm0, [lumaq+lstrideq*0+16] + vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1 + mova xm1, [srcq] + vinserti128 m1, [srcq+strideq], 1 + pmaddubsw m3, m2 + pmaddubsw m0, m2 + pavgw m3, m7 + pavgw m0, m7 +%else + mova m2, [lumaq] + mova m1, [srcq] +%endif +%if %1 +%if %2 + packuswb m2, m3, m0 ; luma +%endif + punpckhbw m3, m2, m1 + punpcklbw m2, m1 ; { luma, chroma } + pmaddubsw m3, m14 + pmaddubsw m2, m14 + psraw m3, 6 + psraw m2, 6 + paddw m3, m15 + paddw m2, m15 + packuswb m2, m3 ; pack+unpack = clip +%endif +%if %1 || %2 == 0 + punpcklbw m3, m2, m7 + punpckhbw m0, m2, m7 +%endif + + ; scaling[luma_src] + pandn m4, m8, m3 + mova m6, m8 + vpgatherdd m2, [scalingq+m4-0], m8 + psrld m3, 16 + mova m8, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pandn m5, m8, m0 + mova m6, m8 + vpgatherdd m3, [scalingq+m5-0], m8 + psrld m0, 16 + mova m8, m6 + vpgatherdd m5, [scalingq+m0-2], m6 + pblendw m2, m4, 0xaa + pblendw m3, m5, 0xaa + + ; grain = grain_lut[offy+y][offx+x] +%if %3 == 0 +%if %2 + movu xm0, [grain_lutq+offxyq] + vinserti128 m0, [grain_lutq+offxyq+82], 1 + movu xm4, [grain_lutq+top_offxyq] + vinserti128 m4, [grain_lutq+top_offxyq+82], 1 +%else + movu m0, [grain_lutq+offxyq] + movu m4, [grain_lutq+top_offxyq] +%endif + punpcklbw m5, m4, m0 + punpckhbw m4, m0 + pmaddubsw m5, m13, m5 + pmaddubsw m4, m13, m4 + pmulhrsw m5, m12 + pmulhrsw m4, m12 + packsswb m5, m4 +%else + movq xm4, [grain_lutq+offxyq] + vinserti128 m4, [grain_lutq+offxyq+8], 1 + movq xm5, [grain_lutq+top_offxyq] + vinserti128 m5, [grain_lutq+top_offxyq+8], 1 + punpcklbw m5, m4 + pmaddubsw m5, m13, m5 + pmulhrsw m5, m12 + vextracti128 xm4, m5, 1 + packsswb xm5, xm4 + ; only interpolate first line, insert second line unmodified + vinserti128 m5, [grain_lutq+offxyq+82], 1 +%endif + punpcklbw m4, m5, m7 + punpckhbw m5, m7 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmaddubsw m2, m4 + pmaddubsw m3, m5 + pmulhrsw m2, m9 + pmulhrsw m3, m9 + + ; unpack chroma_source + punpcklbw m0, m1, m7 + punpckhbw m1, m7 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + pmaxub m0, m10 + pminub m0, m11 +%if %2 + mova [dstq], xm0 + vextracti128 [dstq+strideq], m0, 1 +%else + mova [dstq], m0 +%endif + + sub hb, 1+%2 + jle %%end_y_v_overlap +%if %2 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82<<%2 +%if %2 == 0 + vpbroadcastd m13, [pb_17_27] + add hd, 0x80000000 + jnc %%loop_y_v_overlap +%endif + jmp %%loop_y + +%%end_y_v_overlap: + add wq, 32>>%2 + jge .end + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r14+wq*(1+%2)] + add srcq, wq + add dstq, wq + + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap + +%%loop_x_hv_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr 
seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride + + lea topleft_offxyd, [top_offxyq+(32>>%2)] + lea left_offxyd, [offyq+(32>>%2)] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 +%if %2 == 0 + vpbroadcastd m13, [pb_27_17] +%endif +%%loop_y_hv_overlap: + ; src +%if %2 + mova xm3, [lumaq+lstrideq*0+ 0] + vinserti128 m3, [lumaq+lstrideq*(1+%3)+ 0], 1 + vpbroadcastd m2, [pb_1] + mova xm0, [lumaq+lstrideq*0+16] + vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1 + mova xm1, [srcq] + vinserti128 m1, [srcq+strideq], 1 + pmaddubsw m3, m2 + pmaddubsw m0, m2 + pavgw m3, m7 + pavgw m0, m7 +%else + mova m2, [lumaq] + mova m1, [srcq] +%endif +%if %1 +%if %2 + packuswb m2, m3, m0 ; luma +%endif + punpckhbw m3, m2, m1 + punpcklbw m2, m1 ; { luma, chroma } + pmaddubsw m3, m14 + pmaddubsw m2, m14 + psraw m3, 6 + psraw m2, 6 + paddw m3, m15 + paddw m2, m15 + packuswb m2, m3 ; pack+unpack = clip +%endif +%if %1 || %2 == 0 + punpcklbw m3, m2, m7 + punpckhbw m0, m2, m7 +%endif + + ; scaling[luma_src] + pandn m4, m8, m3 + mova m6, m8 + vpgatherdd m2, [scalingq+m4-0], m8 + psrld m3, 16 + mova m8, m6 + vpgatherdd m4, [scalingq+m3-2], m6 + pandn m5, m8, m0 + mova m6, m8 + vpgatherdd m3, [scalingq+m5-0], m8 + psrld m0, 16 + mova m8, m6 + vpgatherdd m5, [scalingq+m0-2], m6 + pblendw m2, m4, 0xaa + pblendw m3, m5, 0xaa + + ; grain = grain_lut[offy+y][offx+x] +%if %2 + movu xm4, [grain_lutq+offxyq] + vinserti128 m4, [grain_lutq+offxyq+82], 1 + movd xm0, [grain_lutq+left_offxyq] + vinserti128 m0, [grain_lutq+left_offxyq+82], 1 + movd xm6, [grain_lutq+topleft_offxyq] +%if %3 + movq xm5, [grain_lutq+top_offxyq] + vinserti128 m5, [grain_lutq+top_offxyq+8], 1 +%else + vinserti128 m6, [grain_lutq+topleft_offxyq+82], 1 + movu xm5, [grain_lutq+top_offxyq] + vinserti128 m5, [grain_lutq+top_offxyq+82], 1 +%endif + + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklbw m0, m4 +%if %3 + punpcklbw xm6, xm5 +%else + punpcklbw m6, m5 +%endif + punpcklqdq m0, m6 +%if %1 + vpbroadcastq m6, [pb_23_22] + pmaddubsw m0, m6, m0 +%else + pmaddubsw m0, m15, m0 +%endif + pmulhrsw m0, m12 + packsswb m0, m0 + vpblendd m4, m0, 0x11 +%if %3 + pshuflw xm0, xm0, q1032 + vpblendd m5, m0, 0x01 +%else + pshuflw m0, m0, q1032 + vpblendd m5, m0, 0x11 +%endif +%else + movu m4, [grain_lutq+offxyq] + movd xm0, [grain_lutq+left_offxyq] + movu m5, [grain_lutq+top_offxyq] + movd xm6, [grain_lutq+topleft_offxyq] + punpcklbw xm0, xm4 + punpcklbw xm6, xm5 + punpcklqdq xm0, xm6 +%if %1 + vpbroadcastq xm6, [pb_27_17_17_27] + pmaddubsw xm0, xm6, xm0 +%else + pmaddubsw xm0, xm15, xm0 +%endif + pmulhrsw xm0, xm12 + packsswb xm0, xm0 + vpblendd m4, m0, 0x01 + pshuflw xm0, xm0, q1032 + vpblendd m5, m0, 0x01 +%endif + + ; followed by v interpolation (top | cur -> cur) +%if %3 + vpermq m0, m4, q3120 + punpcklbw m5, m0 + pmaddubsw m5, m13, m5 + pmulhrsw m5, m12 + vextracti128 xm0, m5, 1 + 
packsswb xm5, xm0 + vpblendd m5, m4, 0xf0 +%else + punpckhbw m0, m5, m4 + punpcklbw m5, m4 + pmaddubsw m4, m13, m0 + pmaddubsw m5, m13, m5 + pmulhrsw m4, m12 + pmulhrsw m5, m12 + packsswb m5, m4 +%endif + punpcklbw m4, m5, m7 + punpckhbw m5, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmaddubsw m2, m4 + pmaddubsw m3, m5 + pmulhrsw m2, m9 + pmulhrsw m3, m9 + + ; unpack chroma source + punpcklbw m0, m1, m7 + punpckhbw m1, m7 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + pmaxub m0, m10 + pminub m0, m11 +%if %2 + mova [dstq], xm0 + vextracti128 [dstq+strideq], m0, 1 +%else + mova [dstq], m0 +%endif + +%if %2 + lea srcq, [srcq+strideq*2] + lea dstq, [dstq+strideq*2] + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82<<%2 + sub hb, 1+%2 +%if %2 + jg %%loop_y_h_overlap +%else + je %%end_y_hv_overlap + vpbroadcastd m13, [pb_17_27] + add hd, 0x80000000 + jnc %%loop_y_hv_overlap + jmp %%loop_y_h_overlap +%endif + +%%end_y_hv_overlap: + add wq, 32>>%2 + jge .end + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r14+wq*(1+%2)] + add srcq, wq + add dstq, wq + jmp %%loop_x_hv_overlap +%endmacro + + %%FGUV_32x32xN_LOOP 1, %2, %3 +.csfl: + %%FGUV_32x32xN_LOOP 0, %2, %3 +.end: + RET +%endmacro + +GEN_GRAIN_UV_FN 420, 1, 1 +FGUV_FN 420, 1, 1 +GEN_GRAIN_UV_FN 422, 1, 0 +FGUV_FN 422, 1, 0 +GEN_GRAIN_UV_FN 444, 0, 0 +FGUV_FN 444, 0, 0 + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/filmgrain_avx512.asm b/third_party/dav1d/src/x86/filmgrain_avx512.asm new file mode 100644 index 0000000000..317ec118b3 --- /dev/null +++ b/third_party/dav1d/src/x86/filmgrain_avx512.asm @@ -0,0 +1,813 @@ +; Copyright © 2022, VideoLAN and dav1d authors +; Copyright © 2022, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
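+
+; Per-pixel semantics shared by the fgy/fguv kernels below, as a C-style
+; sketch (names illustrative, not the exact reference code):
+;
+;   grain = grain_lut[offy + y][offx + x];
+;   noise = round2(scaling[src] * grain, scaling_shift);
+;   dst   = iclip(src + noise, fg_min, fg_max);
+;
+; (offx, offy) are pseudo-random per-block offsets into the 82x73 grain
+; template, and when overlap_flag is set the first rows/columns of each
+; 32x32 block blend the neighbouring block's grain with fixed weights.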
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" +%include "x86/filmgrain_common.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +pb_even: db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62 + db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94 + db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126 +pb_odd: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63 + db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95 + db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127 +interleave_hl: db 8, 0, 9, 1, 10, 2, 11, 3, 12, 4, 13, 5, 14, 6, 15, 7 +pb_27_17_17_27: db 27, 17, 17, 27, 0, 32, 0, 32 +pb_23_22_0_32: db 23, 22, 0, 32, 0, 32, 0, 32 +pb_27_17: times 2 db 27, 17 +pb_23_22: times 2 db 23, 22 +pw_8: times 2 dw 8 +pw_1024: times 2 dw 1024 +pb_17_27: times 2 db 17, 27 +fg_max: times 4 db 255 + times 4 db 240 + times 4 db 235 +fg_min: times 4 db 0 + times 4 db 16 +noise_rnd: times 2 dw 128 + times 2 dw 64 + times 2 dw 32 + times 2 dw 16 + +SECTION .text + +INIT_ZMM avx512icl +cglobal fgy_32x32xn_8bpc, 6, 13, 22, dst, src, stride, fg_data, w, scaling, \ + grain_lut, h, sby, see, overlap +%define base r11-fg_min + lea r11, [fg_min] + mov r6d, [fg_dataq+FGData.scaling_shift] + mov r7d, [fg_dataq+FGData.clip_to_restricted_range] + mov sbyd, sbym + mov overlapd, [fg_dataq+FGData.overlap_flag] + mov r12, 0x0000000f0000000f ; h_overlap mask + mova m0, [scalingq+64*0] + mova m1, [scalingq+64*1] + mova m2, [scalingq+64*2] + mova m3, [scalingq+64*3] + kmovq k1, r12 + vbroadcasti32x4 m4, [base+interleave_hl] + vpbroadcastd ym16, [base+pb_27_17] + vpbroadcastd m12, [base+pb_17_27] + vpbroadcastd m6, [base+noise_rnd+r6*4-32] + test sbyd, sbyd + setnz r6b + vpbroadcastd m7, [base+fg_min+r7*4] + vpbroadcastd m8, [base+fg_max+r7*8] + pxor m5, m5 + vpbroadcastd m9, [base+pw_1024] + vpbroadcastq m10, [base+pb_27_17_17_27] + vmovdqa64 m12{k1}, m16 + test r6b, overlapb + jnz .v_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rorx seed, seed, 24 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \ + h, sby, see, overlap + + lea src_bakq, [srcq+wq] + neg wq + sub dstq, srcq +.loop_x: + rorx r6, seeq, 1 + or seed, 0xeff4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164 + lea offxd, [offyq+offxq*2+829] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \ + h, sby, see, overlap + + mov grain_lutq, grain_lutmp + mov hd, hm +.loop_y: + movu ym21, [grain_lutq+offxyq-82] + vinserti32x8 m21, [grain_lutq+offxyq+ 0], 1 + call .add_noise + sub hb, 2 + jg .loop_y + add wq, 32 + jge .end + lea srcq, [src_bakq+wq] + test overlapd, overlapd + jz .loop_x + test sbyd, sbyd + jnz .hv_overlap + +.loop_x_h_overlap: + rorx r6, seeq, 1 + or seed, 0xeff4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \ + h, sby, see, left_offxy + + rorx offyd, seed, 8 + mov left_offxyd, offxd ; previous column's offy*stride + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164 + lea offxd, [offyq+offxq*2+829] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \ + h, sby, see, left_offxy + + mov 
grain_lutq, grain_lutmp + mov hd, hm +.loop_y_h_overlap: + movu ym20, [grain_lutq+offxyq-82] + vinserti32x8 m20, [grain_lutq+offxyq+ 0], 1 + movd xm19, [grain_lutq+left_offxyq-50] + vinserti32x4 m19, [grain_lutq+left_offxyq+32], 2 + punpcklbw m19, m20 + pmaddubsw m19, m10, m19 + pmulhrsw m19, m9 + punpckhbw m21, m20, m5 + packsswb m20{k1}, m19, m19 + punpcklbw m20, m5, m20 + call .add_noise_h + sub hb, 2 + jg .loop_y_h_overlap + add wq, 32 + jge .end + lea srcq, [src_bakq+wq] + test sbyd, sbyd + jnz .hv_overlap + jmp .loop_x_h_overlap + +.v_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, offy, offx, \ + h, sby, see, overlap + + movzx r6d, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, r6d, 173 * 0x00010001 + imul r6d, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add r6d, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and r6d, 0xff00ff00 + xor seed, r7d + xor seed, r6d ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \ + h, sby, see, overlap + + lea src_bakq, [srcq+wq] + neg wq + sub dstq, srcq + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offxd, [offyq+offxq*2+0x10001*829+32*82] + + DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \ + h, sby, see, overlap, top_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 + movu ym19, [grain_lutq+offxyq-82] + vinserti32x8 m19, [grain_lutq+offxyq+ 0], 1 + movu ym21, [grain_lutq+top_offxyq-82] + vinserti32x8 m21, [grain_lutq+top_offxyq+ 0], 1 + punpckhbw m20, m21, m19 + punpcklbw m21, m19 + call .add_noise_v + sub hb, 2 + jg .loop_y + add wq, 32 + jge .end + lea srcq, [src_bakq+wq] + + ; since fg_dataq.overlap is guaranteed to be set, we never jump back + ; to .v_overlap, and instead always fall-through to h+v overlap +.hv_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \ + h, sby, see, left_offxy, top_offxy, topleft_offxy + + mov topleft_offxyd, top_offxyd + rorx offyd, seed, 8 + mov left_offxyd, offxd + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offxd, [offyq+offxq*2+0x10001*829+32*82] + + DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \ + h, sby, see, left_offxy, top_offxy, topleft_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 + movu ym19, [grain_lutq+offxyq-82] + vinserti32x8 m19, [grain_lutq+offxyq+ 0], 1 + movd xm16, [grain_lutq+left_offxyq-50] + vinserti32x4 m16, [grain_lutq+left_offxyq+32], 2 + movu ym21, [grain_lutq+top_offxyq-82] + vinserti32x8 m21, [grain_lutq+top_offxyq+ 0], 1 + movd xm17, [grain_lutq+topleft_offxyq-50] + vinserti32x4 m17, [grain_lutq+topleft_offxyq+32], 2 
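+ ; Overlap blending sketch (C-style; names illustrative): for each
+ ; overlapped pair of grain samples,
+ ;   g = iclip(round2(w0*old + w1*new, 5), -128, 127)
+ ; with (w0,w1) one of (27,17), (17,27) or (23,22), chosen per
+ ; row/column and subsampling; pmaddubsw applies the weight pair,
+ ; pmulhrsw with pw_1024 computes round2(x, 5), and packsswb supplies
+ ; the int8 saturation.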
+ ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklbw m16, m19 + pmaddubsw m16, m10, m16 + punpcklbw m17, m21 + pmaddubsw m17, m10, m17 + punpckhbw m20, m21, m19 + pmulhrsw m16, m9 + pmulhrsw m17, m9 + packsswb m19{k1}, m16, m16 + packsswb m21{k1}, m17, m17 + ; followed by v interpolation (top | cur -> cur) + punpcklbw m21, m19 + call .add_noise_v + sub hb, 2 + jg .loop_y_h_overlap + add wq, 32 + lea srcq, [src_bakq+wq] + jl .hv_overlap +.end: + RET +ALIGN function_align +.add_noise_v: + pmaddubsw m20, m12, m20 + pmaddubsw m21, m12, m21 + pmulhrsw m20, m9 + pmulhrsw m21, m9 + packsswb m21, m20 +.add_noise: + punpcklbw m20, m5, m21 + punpckhbw m21, m5 +.add_noise_h: + mova ym18, [srcq+strideq*0] + vinserti32x8 m18, [srcq+strideq*1], 1 + mova m19, m0 + punpcklbw m16, m18, m5 + vpermt2b m19, m18, m1 ; scaling[ 0..127] + vpmovb2m k2, m18 + punpckhbw m17, m18, m5 + vpermi2b m18, m2, m3 ; scaling[128..255] + vmovdqu8 m19{k2}, m18 ; scaling[src] + pshufb m19, m4 + pmaddubsw m18, m19, m20 + pmaddubsw m19, m21 + add grain_lutq, 82*2 + pmulhrsw m18, m6 ; noise + pmulhrsw m19, m6 + paddw m16, m18 + paddw m17, m19 + packuswb m16, m17 + pmaxub m16, m7 + pminub m16, m8 + mova [dstq+srcq], ym16 + add srcq, strideq + vextracti32x8 [dstq+srcq], m16, 1 + add srcq, strideq + ret + +%macro FGUV_FN 3 ; name, ss_hor, ss_ver +cglobal fguv_32x32xn_i%1_8bpc, 6, 14+%2, 22, dst, src, stride, fg_data, w, \ + scaling, grain_lut, h, sby, luma, \ + overlap, uv_pl, is_id, _, stride3 + lea r11, [fg_min] + mov r6d, [fg_dataq+FGData.scaling_shift] + mov r7d, [fg_dataq+FGData.clip_to_restricted_range] + mov r9d, is_idm + mov sbyd, sbym + mov overlapd, [fg_dataq+FGData.overlap_flag] +%if %2 + mov r12, 0x000f000f000f000f ; h_overlap mask + vpbroadcastq m10, [base+pb_23_22_0_32] + lea stride3q, [strideq*3] +%else + mov r12, 0x0000000f0000000f + vpbroadcastq m10, [base+pb_27_17_17_27] +%endif + mova m0, [scalingq+64*0] + mova m1, [scalingq+64*1] + mova m2, [scalingq+64*2] + mova m3, [scalingq+64*3] + kmovq k1, r12 + vbroadcasti32x4 m4, [base+interleave_hl] + vpbroadcastd m6, [base+noise_rnd+r6*4-32] + vpbroadcastd m7, [base+fg_min+r7*4] + shlx r7d, r7d, r9d + vpbroadcastd m8, [base+fg_max+r7*4] + test sbyd, sbyd + setnz r7b + vpbroadcastd m9, [base+pw_1024] + mova m11, [base+pb_even] + mova m12, [base+pb_odd] + pxor m5, m5 + mov r5, r10mp ; lstride + cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 + jne .csfl + +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver + DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \ + h, sby, see, overlap, uv_pl, _, _, stride3 +%if %1 + mov r6d, uv_plm + vpbroadcastd m16, [base+pw_8] + vbroadcasti32x4 m14, [fg_dataq+FGData.uv_mult+r6*4] + vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r6*4] + pshufb m14, m16 ; uv_luma_mult, uv_mult +%endif + test r7b, overlapb + jnz %%v_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rorx seed, seed, 24 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ + offx, offy, see, overlap, _, _, _, stride3 + + mov lumaq, r9mp + lea r11, [srcq+wq] + lea r12, [dstq+wq] + lea r13, [lumaq+wq*(1+%2)] + mov r11mp, r11 + mov r12mp, r12 + neg wq + +%%loop_x: + rorx r6, seeq, 1 + or seed, 0xeff4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + 
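+ ; Offset derivation above, as a C-style sketch (names illustrative):
+ ;   offx = seed >> 12;          // 4 bits
+ ;   offy = (seed >> 8) & 0xf;   // 4 bits
+ ;   x0 = 3 + (6 >> ss_x) + (offx << (1 - ss_x));
+ ;   y0 = 3 + (6 >> ss_y) + (offy << (1 - ss_y));
+ ;   grain = &grain_lut[y0][x0]; // rows are 82 bytes apart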
+ DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ + h, offxy, see, overlap, _, _, _, stride3 + + mov grain_lutq, grain_lutmp + mov hd, hm +%%loop_y: +%if %2 + movu xm21, [grain_lutq+offxyq+82*0] + vinserti128 ym21, [grain_lutq+offxyq+82*1], 1 + vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2 + vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3 +%else + movu ym21, [grain_lutq+offxyq+82*0] + vinserti32x8 m21, [grain_lutq+offxyq+82*1], 1 +%endif + call %%add_noise + sub hb, 2<<%2 + jg %%loop_y + add wq, 32>>%2 + jge .end + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r13+wq*(1<<%2)] + add srcq, wq + add dstq, wq + test overlapd, overlapd + jz %%loop_x + cmp dword r8m, 0 ; sby + jne %%hv_overlap + + ; horizontal overlap (without vertical overlap) +%%loop_x_h_overlap: + rorx r6, seeq, 1 + or seed, 0xeff4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ + offx, offy, see, left_offxy, _, _, _, stride3 + + lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ + h, offxy, see, left_offxy, _, _, _, stride3 + + mov grain_lutq, grain_lutmp + mov hd, hm +%%loop_y_h_overlap: +%if %2 + movu xm20, [grain_lutq+offxyq +82*0] + movd xm19, [grain_lutq+left_offxyq+82*0] + vinserti32x4 ym20, [grain_lutq+offxyq +82*1], 1 + vinserti32x4 ym19, [grain_lutq+left_offxyq+82*1], 1 + vinserti32x4 m20, [grain_lutq+offxyq +82*2], 2 + vinserti32x4 m19, [grain_lutq+left_offxyq+82*2], 2 + vinserti32x4 m20, [grain_lutq+offxyq +82*3], 3 + vinserti32x4 m19, [grain_lutq+left_offxyq+82*3], 3 +%else + movu ym20, [grain_lutq+offxyq + 0] + movd xm19, [grain_lutq+left_offxyq+ 0] + vinserti32x8 m20, [grain_lutq+offxyq +82], 1 + vinserti32x4 m19, [grain_lutq+left_offxyq+82], 2 +%endif + punpcklbw m19, m20 + pmaddubsw m19, m10, m19 + punpckhbw m21, m20, m5 + pmulhrsw m19, m9 + vpacksswb m20{k1}, m19, m19 + punpcklbw m20, m5, m20 + call %%add_noise_h + sub hb, 2<<%2 + jg %%loop_y_h_overlap + add wq, 32>>%2 + jge .end + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r13+wq*(1<<%2)] + add srcq, wq + add dstq, wq + cmp dword r8m, 0 ; sby + jne %%hv_overlap + jmp %%loop_x_h_overlap + +%%v_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \ + _, sby, see, overlap, _, _, _, stride3 + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + +%if %3 + vpbroadcastd m13, [base+pb_23_22] + kxnorw k3, k3, k3 ; v_overlap mask +%elif %2 + vbroadcasti32x8 m13, [base+pb_27_17] + kxnord k3, k3, k3 + pshufd m13, m13, q0000 ; 8x27_17, 8x17_27 +%else + vpbroadcastd ym16, [base+pb_27_17] + vpbroadcastd m13, [base+pb_17_27] + vmovdqa64 m13{k1}, m16 +%endif + + DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ + offx, offy, see, overlap, top_offxy, _, _, stride3 + + mov lumaq, r9mp + lea r11, [srcq+wq] + lea r12, [dstq+wq] + lea r13, [lumaq+wq*(1<<%2)] + mov r11mp, r11 + mov r12mp, r12 + neg wq + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of 
top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0x000f000f + and offxd, 0x000f000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ + h, offxy, see, overlap, top_offxy, _, _, stride3 + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 + +%if %3 + movu xm18, [grain_lutq+offxyq+82*0] + movu xm20, [grain_lutq+top_offxyq+82*0] + ; only interpolate first line, insert remaining line unmodified + vbroadcasti128 ym21, [grain_lutq+offxyq+82*1] + vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2 + vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3 + punpcklbw xm19, xm20, xm18 + punpckhbw xm20, xm18 +%elif %2 + movu xm18, [grain_lutq+offxyq+82*0] + vinserti128 ym18, [grain_lutq+offxyq+82*1], 1 + movu xm20, [grain_lutq+top_offxyq+82*0] + vinserti32x4 ym20, [grain_lutq+top_offxyq+82*1], 1 + vbroadcasti32x4 m21, [grain_lutq+offxyq+82*2] + vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3 + punpcklbw ym19, ym20, ym18 + punpckhbw ym20, ym18 +%else + movu ym21, [grain_lutq+offxyq+82*0] + vinserti32x8 m21, [grain_lutq+offxyq+82*1], 1 + movu ym20, [grain_lutq+top_offxyq+82*0] + vinserti32x8 m20, [grain_lutq+top_offxyq+82*1], 1 +%endif + call %%add_noise_v + sub hb, 2<<%2 + jg %%loop_y + add wq, 32>>%2 + jge .end + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r13+wq*(1<<%2)] + add srcq, wq + add dstq, wq + +%%hv_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy, _, stride3 + + lea topleft_offxyd, [top_offxyq+(32>>%2)] + lea left_offxyd, [offyq+(32>>%2)] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0x000f000f + and offxd, 0x000f000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy, _, stride3 + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 + +%if %2 + movu xm21, [grain_lutq+offxyq+82*0] + movd xm16, [grain_lutq+left_offxyq+82*0] + vinserti128 ym21, [grain_lutq+offxyq+82*1], 1 + vinserti128 ym16, [grain_lutq+left_offxyq+82*1], 1 + vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2 + vinserti32x4 m16, [grain_lutq+left_offxyq+82*2], 2 + vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3 + vinserti32x4 m16, [grain_lutq+left_offxyq+82*3], 3 + movd xm18, [grain_lutq+topleft_offxyq+82*0] + movu xm20, [grain_lutq+top_offxyq] + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklbw m16, m21 +%if %3 + punpcklbw xm18, xm20 +%else + vinserti128 ym18, [grain_lutq+topleft_offxyq+82*1], 1 + vinserti128 ym20, [grain_lutq+top_offxyq+82*1], 1 + punpcklbw ym18, ym20 +%endif + punpcklqdq m16, m18 + pmaddubsw m16, m10, m16 + pmulhrsw m16, m9 
+ packsswb m16, m16 + vmovdqu8 m21{k1}, m16 +%if %3 + vpalignr xm20{k1}, xm16, xm16, 4 + punpcklbw xm19, xm20, xm21 + punpckhbw xm20, xm21 +%else + vpalignr ym20{k1}, ym16, ym16, 4 + punpcklbw ym19, ym20, ym21 + punpckhbw ym20, ym21 +%endif +%else + movu ym21, [grain_lutq+offxyq+82*0] + vinserti32x8 m21, [grain_lutq+offxyq+82*1], 1 + movd xm16, [grain_lutq+left_offxyq+82*0] + vinserti32x4 m16, [grain_lutq+left_offxyq+82*1], 2 + movu ym20, [grain_lutq+top_offxyq+82*0] + vinserti32x8 m20, [grain_lutq+top_offxyq+82*1], 1 + movd xm18, [grain_lutq+topleft_offxyq+82*0] + vinserti32x4 m18, [grain_lutq+topleft_offxyq+82*1], 2 + punpcklbw m16, m21 + punpcklbw m18, m20 + punpcklqdq m16, m18 + pmaddubsw m16, m10, m16 + pmulhrsw m16, m9 + packsswb m16, m16 + vpalignr m20{k1}, m16, m16, 4 + vmovdqu8 m21{k1}, m16 +%endif + call %%add_noise_v + sub hb, 2<<%2 + jg %%loop_y_h_overlap + add wq, 32>>%2 + jge .end + mov srcq, r11mp + mov dstq, r12mp + lea lumaq, [r13+wq*(1<<%2)] + add srcq, wq + add dstq, wq + jmp %%hv_overlap +ALIGN function_align +%%add_noise_v: +%if %3 + pmaddubsw xm19, xm13, xm19 + pmaddubsw xm20, xm13, xm20 + pmulhrsw xm19, xm9 + pmulhrsw xm20, xm9 + vpacksswb m21{k3}, m19, m20 +%elif %2 + pmaddubsw ym19, ym13, ym19 + pmaddubsw ym20, ym13, ym20 + pmulhrsw ym19, ym9 + pmulhrsw ym20, ym9 + vpacksswb m21{k3}, m19, m20 +%else + punpcklbw m19, m20, m21 + punpckhbw m20, m21 + pmaddubsw m19, m13, m19 + pmaddubsw m20, m13, m20 + pmulhrsw m19, m9 + pmulhrsw m20, m9 + packsswb m21, m19, m20 +%endif +%%add_noise: + punpcklbw m20, m5, m21 + punpckhbw m21, m5 +%%add_noise_h: + mova ym18, [lumaq+lstrideq*(0<<%3)] + vinserti32x8 m18, [lumaq+lstrideq*(1<<%3)], 1 +%if %2 + lea lumaq, [lumaq+lstrideq*(2<<%3)] + mova ym16, [lumaq+lstrideq*(0<<%3)] + vinserti32x8 m16, [lumaq+lstrideq*(1<<%3)], 1 + mova xm17, [srcq+strideq*0] + mova m19, m11 + vpermi2b m19, m18, m16 + vinserti128 ym17, [srcq+strideq*1], 1 + vpermt2b m18, m12, m16 + vinserti32x4 m17, [srcq+strideq*2], 2 + pavgb m18, m19 + vinserti32x4 m17, [srcq+stride3q ], 3 +%else + mova ym17, [srcq+strideq*0] + vinserti32x8 m17, [srcq+strideq*1], 1 +%endif +%if %1 + punpckhbw m19, m18, m17 + punpcklbw m18, m17 ; { luma, chroma } + pmaddubsw m19, m14 + pmaddubsw m18, m14 + psraw m19, 6 + psraw m18, 6 + paddw m19, m15 + paddw m18, m15 + packuswb m18, m19 +.add_noise_main: + mova m19, m0 + vpermt2b m19, m18, m1 ; scaling[ 0..127] + vpmovb2m k2, m18 + vpermi2b m18, m2, m3 ; scaling[128..255] + vmovdqu8 m19{k2}, m18 ; scaling[src] + pshufb m19, m4 + pmaddubsw m18, m19, m20 + pmaddubsw m19, m21 + add grain_lutq, 82*2<<%2 + lea lumaq, [lumaq+lstrideq*(2<<%3)] + lea srcq, [srcq+strideq*(2<<%2)] + pmulhrsw m18, m6 ; noise + pmulhrsw m19, m6 + punpcklbw m16, m17, m5 ; chroma + punpckhbw m17, m5 + paddw m16, m18 + paddw m17, m19 + packuswb m16, m17 + pmaxub m16, m7 + pminub m16, m8 +%if %2 + mova [dstq+strideq*0], xm16 + vextracti128 [dstq+strideq*1], ym16, 1 + vextracti32x4 [dstq+strideq*2], m16, 2 + vextracti32x4 [dstq+stride3q ], m16, 3 +%else + mova [dstq+strideq*0], ym16 + vextracti32x8 [dstq+strideq*1], m16, 1 +%endif + lea dstq, [dstq+strideq*(2<<%2)] + ret +%else + jmp .add_noise_main +%endif +%endmacro + + %%FGUV_32x32xN_LOOP 1, %2, %3 +.csfl: + %%FGUV_32x32xN_LOOP 0, %2, %3 +.end: + RET +%endmacro + +FGUV_FN 420, 1, 1 +FGUV_FN 422, 1, 0 +FGUV_FN 444, 0, 0 + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/filmgrain_common.asm b/third_party/dav1d/src/x86/filmgrain_common.asm new file mode 100644 index 0000000000..74f7044e66 --- /dev/null +++ 
b/third_party/dav1d/src/x86/filmgrain_common.asm @@ -0,0 +1,46 @@ +; Copyright © 2019-2022, VideoLAN and dav1d authors +; Copyright © 2019-2022, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +struc FGData + .seed: resd 1 + .num_y_points: resd 1 + .y_points: resb 14 * 2 + .chroma_scaling_from_luma: resd 1 + .num_uv_points: resd 2 + .uv_points: resb 2 * 10 * 2 + .scaling_shift: resd 1 + .ar_coeff_lag: resd 1 + .ar_coeffs_y: resb 24 + .ar_coeffs_uv: resb 2 * 28 ; includes padding + .ar_coeff_shift: resq 1 + .grain_scale_shift: resd 1 + .uv_mult: resd 2 + .uv_luma_mult: resd 2 + .uv_offset: resd 2 + .overlap_flag: resd 1 + .clip_to_restricted_range: resd 1 +endstruc + +cextern gaussian_sequence diff --git a/third_party/dav1d/src/x86/filmgrain_sse.asm b/third_party/dav1d/src/x86/filmgrain_sse.asm new file mode 100644 index 0000000000..0172f98760 --- /dev/null +++ b/third_party/dav1d/src/x86/filmgrain_sse.asm @@ -0,0 +1,3233 @@ +; Copyright © 2019-2021, VideoLAN and dav1d authors +; Copyright © 2019, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" +%include "x86/filmgrain_common.asm" + +SECTION_RODATA + +pw_1024: times 8 dw 1024 +pb_27_17_17_27: db 27, 17, 17, 27 + times 6 db 0, 32 +pb_23_22_h: db 23, 22 + times 7 db 0, 32 +pb_27_17: times 8 db 27, 17 +pb_17_27: times 8 db 17, 27 +pb_23_22: times 8 db 23, 22 +pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 +rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 +byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0 +pw_seed_xor: times 2 dw 0xb524 + times 2 dw 0x49d8 +pb_1: times 4 db 1 +hmul_bits: dw 32768, 16384, 8192, 4096 +round: dw 2048, 1024, 512 +mul_bits: dw 256, 128, 64, 32, 16 +round_vals: dw 32, 64, 128, 256, 512 +max: dw 255, 240, 235 +min: dw 0, 16 +pw_1: dw 1 + +%macro JMP_TABLE 2-* + %xdefine %1_8bpc_%2_table %%table + %xdefine %%base %1_8bpc_%2_table + %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) + %%table: + %rep %0 - 2 + dd %%prefix %+ .ar%3 - %%base + %rotate 1 + %endrep +%endmacro + +JMP_TABLE generate_grain_y, ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_420, ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_422, ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_444, ssse3, 0, 1, 2, 3 + +SECTION .text + +%if ARCH_X86_32 +%define PIC_ptr(a) base+a +%else +%define PIC_ptr(a) a +%endif + +%macro SCRATCH 3 +%if ARCH_X86_32 + mova [rsp+%3*mmsize], m%1 +%define m%2 [rsp+%3*mmsize] +%else + SWAP %1, %2 +%endif +%endmacro + +INIT_XMM ssse3 +cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data + LEA r4, $$ +%define base r4-$$ + movq m1, [base+rnd_next_upperbit_mask] + movq m4, [base+mul_bits] + movq m7, [base+hmul_bits] + mov r2d, [fg_dataq+FGData.grain_scale_shift] + movd m2, [base+round+r2*2] + movd m0, [fg_dataq+FGData.seed] + mova m5, [base+pb_mask] + pshuflw m2, m2, q0000 + pshuflw m0, m0, q0000 + mov r2, -73*82 + sub bufq, r2 + lea r3, [base+gaussian_sequence] +.loop: + pand m6, m0, m1 + psrlw m3, m6, 10 + por m6, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw m6, m4 ; bits 0x0f00 are set + pshufb m3, m5, m6 ; set 15th bit for next 4 seeds + psllq m6, m3, 30 + por m3, m6 + psllq m6, m3, 15 + por m3, m6 ; aggregate each bit into next seed's high bit + pmulhuw m6, m0, m7 + por m3, m6 ; 4 next output seeds + pshuflw m0, m3, q3333 + psrlw m3, 5 +%if ARCH_X86_64 + movq r6, m3 + mov r8, r6 + movzx r5d, r6w + shr r6d, 16 + shr r8, 32 + movzx r7, r8w + shr r8, 16 + + movd m6, [r3+r5*2] + pinsrw m6, [r3+r6*2], 1 + pinsrw m6, [r3+r7*2], 2 + pinsrw m6, [r3+r8*2], 3 +%else + movd r6, m3 + pshuflw m3, m3, q3232 + movzx r5, r6w + shr r6, 16 + + movd m6, [r3+r5*2] + pinsrw m6, [r3+r6*2], 1 + + movd r6, m3 + movzx r5, r6w + shr r6, 16 + + pinsrw m6, [r3+r5*2], 2 + pinsrw m6, [r3+r6*2], 3 +%endif + pmulhrsw m6, m2 + packsswb m6, m6 + movd [bufq+r2], m6 + add r2, 4 + jl .loop + + ; auto-regression code + movsxd r2, [fg_dataq+FGData.ar_coeff_lag] + movsxd r2, [base+generate_grain_y_8bpc_ssse3_table+r2*4] + lea r2, 
[r2+base+generate_grain_y_8bpc_ssse3_table] + jmp r2 + +.ar1: +%if ARCH_X86_32 + DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max +%elif WIN64 + DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0 + mov bufq, r0 +%else + DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0 +%endif + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] + movd m4, [fg_dataq+FGData.ar_coeffs_y] + mov ecx, [fg_dataq+FGData.ar_coeff_shift] +%if ARCH_X86_32 + mov r1m, cf3d + DEFINE_ARGS buf, shift, val3, min, max, x, val0 +%define hd r0mp +%define cf3d r1mp +%elif WIN64 + DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0 +%else + DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0 +%endif + pxor m6, m6 + pcmpgtb m7, m6, m4 + punpcklbw m4, m7 + pinsrw m4, [base+pw_1], 3 + pshufd m5, m4, q1111 + pshufd m4, m4, q0000 + movd m3, [base+round_vals+shiftq*2-12] ; rnd + pshuflw m3, m3, q0000 + sub bufq, 82*73-(82*3+79) + mov hd, 70 + mov mind, -128 + mov maxd, 127 +.y_loop_ar1: + mov xq, -76 + movsx val3d, byte [bufq+xq-1] +.x_loop_ar1: + movq m0, [bufq+xq-82-1] ; top/left + pcmpgtb m7, m6, m0 + punpcklbw m0, m7 + psrldq m2, m0, 2 ; top + psrldq m1, m0, 4 ; top/right + punpcklwd m0, m2 + punpcklwd m1, m3 + pmaddwd m0, m4 + pmaddwd m1, m5 + paddd m0, m1 +.x_loop_ar1_inner: + movd val0d, m0 + psrldq m0, 4 + imul val3d, cf3d + add val3d, val0d + sar val3d, shiftb + movsx val0d, byte [bufq+xq] + add val3d, val0d + cmp val3d, maxd + cmovns val3d, maxd + cmp val3d, mind + cmovs val3d, mind + mov byte [bufq+xq], val3b + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82 + dec hd + jg .y_loop_ar1 +.ar0: + RET + +.ar2: +%if ARCH_X86_32 +%assign stack_offset_old stack_offset + ALLOC_STACK -16*8 +%endif + DEFINE_ARGS buf, fg_data, shift + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m6, [base+round_vals-12+shiftq*2] + movd m7, [base+byte_blend+1] + SCRATCH 7, 15, 7 + movq m0, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7 + movd m1, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11 + pxor m7, m7 + pshuflw m6, m6, q0000 + punpcklwd m6, m7 + pcmpgtb m4, m7, m0 + pcmpgtb m5, m7, m1 + punpcklbw m0, m4 + punpcklbw m1, m5 + DEFINE_ARGS buf, fg_data, h, x + pshufd m4, m1, q0000 + pshufd m5, m1, q1111 + pshufd m3, m0, q3333 + pshufd m2, m0, q2222 + pshufd m1, m0, q1111 + pshufd m0, m0, q0000 + SCRATCH 0, 8, 0 + SCRATCH 1, 9, 1 + SCRATCH 2, 10, 2 + SCRATCH 3, 11, 3 + SCRATCH 4, 12, 4 + SCRATCH 5, 13, 5 + SCRATCH 6, 14, 6 + sub bufq, 82*73-(82*3+79) + mov hd, 70 +.y_loop_ar2: + mov xq, -76 + +.x_loop_ar2: + movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] + movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] + pcmpgtb m2, m7, m0 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 + psrldq m5, m0, 2 ; y=-2,x=[-1,+5] + psrldq m3, m1, 2 ; y=-1,x=[-1,+5] + psrldq m4, m1, 4 ; y=-1,x=[+0,+5] + punpcklwd m2, m0, m5 + punpcklwd m3, m4 + pmaddwd m2, m8 + pmaddwd m3, m11 + paddd m2, m3 + + psrldq m4, m0, 4 ; y=-2,x=[+0,+5] + psrldq m5, m0, 6 ; y=-2,x=[+1,+5] + psrldq m6, m0, 8 ; y=-2,x=[+2,+5] + punpcklwd m4, m5 + punpcklwd m6, m1 + psrldq m5, m1, 6 ; y=-1,x=[+1,+5] + psrldq m1, m1, 8 ; y=-1,x=[+2,+5] + punpcklwd m5, m1 + pmaddwd m4, m9 + pmaddwd m6, m10 + pmaddwd m5, m12 + paddd m4, m6 + paddd m2, m5 + paddd m2, m4 + paddd m2, m14 + + movq m0, [bufq+xq-2] ; y=0,x=[-2,+5] +.x_loop_ar2_inner: + pcmpgtb m4, m7, m0 + punpcklbw m1, m0, m4 + pmaddwd m3, m1, m13 + paddd m3, m2 + psrldq m1, 4 ; y=0,x=0 + psrldq m2, 4 ; shift top to next 
pixel + psrad m3, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw since we only care about one value + paddw m3, m1 + packsswb m3, m3 + pslldq m3, 2 + pand m3, m15 + pandn m1, m15, m0 + por m0, m1, m3 + psrldq m0, 1 + ; overwrite 2 pixels, but that's ok + movd [bufq+xq-1], m0 + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82 + dec hd + jg .y_loop_ar2 + RET + +.ar3: + DEFINE_ARGS buf, fg_data, shift +%if ARCH_X86_32 +%assign stack_offset stack_offset_old + ALLOC_STACK -16*14 +%elif WIN64 + SUB rsp, 16*6 +%assign stack_size_padded (stack_size_padded+16*6) +%assign stack_size (stack_size+16*6) +%else + ALLOC_STACK -16*6 +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m6, [base+round_vals-12+shiftq*2] + movd m7, [base+byte_blend] + movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15 + movq m2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 + pxor m3, m3 + pcmpgtb m4, m3, m0 + pcmpgtb m3, m2 + pshuflw m6, m6, q0000 + SCRATCH 6, 14, 12 + SCRATCH 7, 15, 13 + punpckhbw m1, m0, m4 + punpcklbw m0, m4 + punpcklbw m2, m3 + pshufd m3, m0, q1111 + pshufd m4, m0, q2222 + pshufd m5, m0, q3333 + pshufd m0, m0, q0000 + mova [rsp+ 0*16], m0 + mova [rsp+ 1*16], m3 + mova [rsp+ 2*16], m4 + mova [rsp+ 3*16], m5 + pshufd m6, m1, q1111 + pshufd m7, m1, q2222 + pshufd m5, m1, q3333 + pshufd m1, m1, q0000 + pshufd m3, m2, q1111 + psrldq m0, m2, 10 + pinsrw m2, [base+pw_1], 5 + pshufd m4, m2, q2222 + pshufd m2, m2, q0000 + pinsrw m0, [base+round_vals+shiftq*2-10], 3 + mova [rsp+ 4*16], m1 + mova [rsp+ 5*16], m6 + SCRATCH 7, 8, 6 + SCRATCH 5, 9, 7 + SCRATCH 2, 10, 8 + SCRATCH 3, 11, 9 + SCRATCH 4, 12, 10 + SCRATCH 0, 13, 11 + DEFINE_ARGS buf, fg_data, h, x + sub bufq, 82*73-(82*3+79) + mov hd, 70 +.y_loop_ar3: + mov xq, -76 + +.x_loop_ar3: + movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] + pxor m3, m3 + pcmpgtb m3, m0 + punpckhbw m2, m0, m3 + punpcklbw m0, m3 + + psrldq m5, m0, 2 + psrldq m6, m0, 4 + psrldq m7, m0, 6 + punpcklwd m4, m0, m5 + punpcklwd m6, m7 + pmaddwd m4, [rsp+ 0*16] + pmaddwd m6, [rsp+ 1*16] + paddd m4, m6 + + movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] + pxor m5, m5 + pcmpgtb m5, m1 + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + palignr m6, m2, m0, 10 + palignr m7, m2, m0, 12 + psrldq m0, 8 + punpcklwd m0, m6 + punpcklwd m7, m1 + pmaddwd m0, [rsp+ 2*16] + pmaddwd m7, [rsp+ 3*16] + paddd m0, m7 + paddd m0, m4 + + psrldq m4, m1, 2 + psrldq m5, m1, 4 + psrldq m6, m1, 6 + psrldq m7, m1, 8 + punpcklwd m4, m5 + punpcklwd m6, m7 + pmaddwd m4, [rsp+ 4*16] + pmaddwd m6, [rsp+ 5*16] + paddd m4, m6 + paddd m0, m4 + + movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] + pxor m7, m7 + pcmpgtb m7, m2 + punpckhbw m5, m2, m7 + punpcklbw m2, m7 + palignr m7, m3, m1, 10 + palignr m3, m1, 12 + psrldq m1, m2, 2 + punpcklwd m7, m3 + punpcklwd m3, m2, m1 + pmaddwd m7, m8 + pmaddwd m3, m9 + paddd m7, m3 + paddd m0, m7 + + psrldq m6, m2, 4 + psrldq m1, m2, 6 + psrldq m3, m2, 8 + palignr m4, m5, m2, 10 + palignr m5, m5, m2, 12 + + punpcklwd m6, m1 + punpcklwd m3, m4 + punpcklwd m5, m14 + pmaddwd m6, m10 + pmaddwd m3, m11 + pmaddwd m5, m12 + paddd m0, m6 + paddd m3, m5 + paddd m0, m3 + + movq m1, [bufq+xq-3] ; y=0,x=[-3,+4] +.x_loop_ar3_inner: + pxor m5, m5 + pcmpgtb m5, m1 + punpcklbw m2, m1, m5 + pmaddwd m2, m13 + pshufd m3, m2, q1111 + paddd m2, m3 ; left+cur + paddd m2, m0 ; add top + psrldq m0, 4 + psrad m2, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw since we only care about one value + packsswb m2, m2 + pslldq m2, 3 + pand m2, m15 + pandn 
m3, m15, m1 + por m1, m2, m3 + movd [bufq+xq-3], m1 + psrldq m1, 1 + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82 + dec hd + jg .y_loop_ar3 + RET + +%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y +INIT_XMM ssse3 +cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv + movifnidn r2, r2mp + movifnidn r3, r3mp + LEA r4, $$ +%define base r4-$$ + movq m1, [base+rnd_next_upperbit_mask] + movq m4, [base+mul_bits] + movq m7, [base+hmul_bits] + mov r5d, [fg_dataq+FGData.grain_scale_shift] + movd m6, [base+round+r5*2] + mova m5, [base+pb_mask] + movd m0, [fg_dataq+FGData.seed] + movd m2, [base+pw_seed_xor+uvq*4] + pxor m0, m2 + pshuflw m6, m6, q0000 + pshuflw m0, m0, q0000 + lea r6, [base+gaussian_sequence] +%if %2 +%if ARCH_X86_64 + mov r7d, 73-35*%3 +%else + mov r3mp, 73-35*%3 +%endif + add bufq, 44 +.loop_y: + mov r5, -44 +.loop_x: +%else + mov r5, -82*73 + sub bufq, r5 +.loop: +%endif + pand m2, m0, m1 + psrlw m3, m2, 10 + por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set + pmullw m2, m4 ; bits 0x0f00 are set + pshufb m3, m5, m2 ; set 15th bit for next 4 seeds + psllq m2, m3, 30 + por m3, m2 + psllq m2, m3, 15 + por m3, m2 ; aggregate each bit into next seed's high bit + pmulhuw m2, m0, m7 + por m2, m3 ; 4 next output seeds + pshuflw m0, m2, q3333 + psrlw m2, 5 +%if ARCH_X86_64 + movd r9d, m2 + pshuflw m2, m2, q3232 + movzx r8, r9w + shr r9, 16 + + movd m3, [r6+r8*2] + pinsrw m3, [r6+r9*2], 1 + + movd r9d, m2 + movzx r8, r9w + shr r9, 16 + + pinsrw m3, [r6+r8*2], 2 + pinsrw m3, [r6+r9*2], 3 +%else + movd r2, m2 + pshuflw m2, m2, q3232 + movzx r1, r2w + shr r2, 16 + + movd m3, [r6+r1*2] + pinsrw m3, [r6+r2*2], 1 + + movd r2, m2 + movzx r1, r2w + shr r2, 16 + + pinsrw m3, [r6+r1*2], 2 + pinsrw m3, [r6+r2*2], 3 +%endif + pmulhrsw m3, m6 + packsswb m3, m3 + movd [bufq+r5], m3 + add r5, 4 +%if %2 + jl .loop_x + add bufq, 82 +%if ARCH_X86_64 + dec r7d +%else + dec r3mp +%endif + jg .loop_y +%else + jl .loop +%endif + +%if ARCH_X86_32 + mov r2, r2mp +%endif + + ; auto-regression code + movsxd r5, [fg_dataq+FGData.ar_coeff_lag] + movsxd r5, [base+generate_grain_uv_%1_8bpc_ssse3_table+r5*4] + lea r5, [r5+base+generate_grain_uv_%1_8bpc_ssse3_table] + jmp r5 + +.ar0: + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + movifnidn bufyq, bufymp +%if ARCH_X86_32 +%assign stack_offset_old stack_offset + ALLOC_STACK -2*16 +%endif + imul uvd, 28 + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m5, [fg_dataq+FGData.ar_coeffs_uv+uvq] + movd m4, [base+hmul_bits+shiftq*2] + DEFINE_ARGS buf, bufy, h, x + pxor m0, m0 + pcmpgtb m0, m5 + punpcklbw m5, m0 + movd m7, [base+pb_1] +%if %2 + movd m6, [base+hmul_bits+2+%3*2] +%endif + pshuflw m5, m5, q0000 + pshuflw m4, m4, q0000 + pshufd m7, m7, q0000 +%if %2 + pshuflw m6, m6, q0000 +%endif + punpcklqdq m5, m5 + punpcklqdq m4, m4 +%if %2 + punpcklqdq m6, m6 +%endif + pcmpeqw m1, m1 + pslldq m1, 12>>%2 + SCRATCH 1, 8, 0 + SCRATCH 4, 9, 1 +%if %2 + sub bufq, 82*(73-35*%3)+82-(82*3+41) +%else + sub bufq, 82*70-3 +%endif + add bufyq, 3+82*3 + mov hd, 70-35*%3 +.y_loop_ar0: + xor xd, xd +.x_loop_ar0: + ; first 32 pixels +%if %2 + movu m1, [bufyq+xq*2] +%if %3 + movu m2, [bufyq+xq*2+82] +%endif + movu m3, [bufyq+xq*2+16] +%if %3 + movu m4, [bufyq+xq*2+82+16] +%endif + pmaddubsw m0, m7, m1 +%if %3 + pmaddubsw m1, m7, m2 +%endif + pmaddubsw m2, m7, m3 +%if %3 + pmaddubsw m3, m7, m4 + paddw m0, m1 + paddw m2, m3 +%endif + pmulhrsw m0, m6 + pmulhrsw m2, m6 +%else + movu 
m0, [bufyq+xq] + pxor m6, m6 + pcmpgtb m6, m0 + punpckhbw m2, m0, m6 + punpcklbw m0, m6 +%endif + pmullw m0, m5 + pmullw m2, m5 + pmulhrsw m0, m9 + pmulhrsw m2, m9 + movu m1, [bufq+xq] + pxor m4, m4 + pcmpgtb m4, m1 + punpckhbw m3, m1, m4 +%if %2 + punpcklbw m1, m4 + paddw m2, m3 + paddw m0, m1 +%else + punpcklbw m6, m1, m4 + paddw m2, m3 + paddw m0, m6 +%endif + packsswb m0, m2 +%if %2 + movu [bufq+xq], m0 + add xd, 16 + cmp xd, 32 + jl .x_loop_ar0 + + ; last 6/12 pixels + movu m1, [bufyq+xq*(1+%2)] +%if %3 + movu m2, [bufyq+xq*2+82] +%endif + pmaddubsw m0, m7, m1 +%if %3 + pmaddubsw m1, m7, m2 + paddw m0, m1 +%endif + pmulhrsw m0, m6 + pmullw m0, m5 + pmulhrsw m0, m9 + movq m1, [bufq+xq] + pxor m4, m4 + pcmpgtb m4, m1 + punpcklbw m2, m1, m4 + paddw m0, m2 + packsswb m0, m0 + pandn m2, m8, m0 + pand m1, m8 + por m2, m1 + movq [bufq+xq], m2 +%else + add xd, 16 + cmp xd, 80 + je .y_loop_final_ar0 + movu [bufq+xq-16], m0 + jmp .x_loop_ar0 +.y_loop_final_ar0: + pandn m2, m8, m0 + pand m1, m8 + por m2, m1 + movu [bufq+xq-16], m2 +%endif + + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar0 + RET + +.ar1: +%if ARCH_X86_32 +%assign stack_offset stack_offset_old +%assign stack_size_padded 0 +%xdefine rstk rsp +%endif + DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x + imul uvd, 28 + movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] + movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1] + pinsrw m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2 +%if ARCH_X86_32 + mov r3mp, cf3d + DEFINE_ARGS buf, shift, fg_data, val3, min, max, x +%elif WIN64 + DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x + mov bufq, r0 +%else + DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + movd m3, [base+round_vals+shiftq*2-12] ; rnd +%if %2 + movd m7, [base+pb_1] + movd m6, [base+hmul_bits+2+%3*2] +%endif + psrldq m4, 1 +%if ARCH_X86_32 + DEFINE_ARGS buf, shift, val0, val3, min, max, x +%elif WIN64 + DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0 +%else + DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0 +%endif + pxor m5, m5 + punpcklwd m3, m5 +%if %2 + punpcklwd m6, m6 +%endif + pcmpgtb m5, m4 + punpcklbw m4, m5 + pshufd m5, m4, q1111 + pshufd m4, m4, q0000 + pshufd m3, m3, q0000 +%if %2 + pshufd m7, m7, q0000 + pshufd m6, m6, q0000 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*69+3 +%endif +%if ARCH_X86_32 + add r1mp, 79+82*3 + mov r0mp, 70-35*%3 +%else + add bufyq, 79+82*3 + mov hd, 70-35*%3 +%endif + mov mind, -128 + mov maxd, 127 +.y_loop_ar1: + mov xq, -(76>>%2) + movsx val3d, byte [bufq+xq-1] +.x_loop_ar1: +%if %2 +%if ARCH_X86_32 + mov r2, r1mp + movq m0, [r2+xq*2] +%if %3 + movq m1, [r2+xq*2+82] +%endif +%else + movq m0, [bufyq+xq*2] +%if %3 + movq m1, [bufyq+xq*2+82] +%endif +%endif + pmaddubsw m2, m7, m0 +%if %3 + pmaddubsw m0, m7, m1 + paddw m2, m0 +%endif + pmulhrsw m2, m6 +%else +%if ARCH_X86_32 + mov r2, r1mp + movd m2, [r2+xq] +%else + movd m2, [bufyq+xq] +%endif + pxor m0, m0 + pcmpgtb m0, m2 + punpcklbw m2, m0 +%endif + + movq m0, [bufq+xq-82-1] ; top/left + pxor m1, m1 + pcmpgtb m1, m0 + punpcklbw m0, m1 + psrldq m1, m0, 4 ; top/right + punpcklwd m1, m2 + psrldq m2, m0, 2 ; top + punpcklwd m0, m2 + pmaddwd m0, m4 + pmaddwd m1, m5 + paddd m0, m1 + paddd m0, m3 +.x_loop_ar1_inner: + movd val0d, m0 + psrldq m0, 4 +%if ARCH_X86_32 + imul val3d, r3mp +%else + imul val3d, cf3d +%endif + add val3d, val0d + sar val3d, shiftb + movsx val0d, byte [bufq+xq] + add val3d, val0d + 
cmp val3d, maxd + cmovns val3d, maxd + cmp val3d, mind + cmovs val3d, mind + mov byte [bufq+xq], val3b + ; keep val3d in-place as left for next x iteration + inc xq + jz .x_loop_ar1_end + test xq, 3 + jnz .x_loop_ar1_inner + jmp .x_loop_ar1 + +.x_loop_ar1_end: + add bufq, 82 +%if ARCH_X86_32 + add r1mp, 82<<%3 + dec r0mp +%else + add bufyq, 82<<%3 + dec hd +%endif + jg .y_loop_ar1 + RET + +.ar2: +%if ARCH_X86_32 +%assign stack_offset stack_offset_old +%assign stack_size_padded 0 +%xdefine rstk rsp + ALLOC_STACK -8*16 +%endif + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + movifnidn bufyq, bufymp + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 + movd m7, [base+round_vals-12+shiftq*2] + movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-12 + pxor m2, m2 + pcmpgtb m2, m0 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 + pinsrw m1, [base+pw_1], 5 + punpcklwd m7, m7 + pshufd m7, m7, q0000 + DEFINE_ARGS buf, bufy, fg_data, h, unused, x + pshufd m4, m1, q0000 + pshufd m5, m1, q1111 + pshufd m6, m1, q2222 + pshufd m3, m0, q3333 + pshufd m2, m0, q2222 + pshufd m1, m0, q1111 + pshufd m0, m0, q0000 + SCRATCH 0, 8, 0 + SCRATCH 1, 9, 1 + SCRATCH 2, 10, 2 + SCRATCH 3, 11, 3 + SCRATCH 4, 12, 4 + SCRATCH 5, 13, 5 + SCRATCH 6, 14, 6 + SCRATCH 7, 15, 7 +%if %2 + movd m7, [base+hmul_bits+2+%3*2] + movd m6, [base+pb_1] + punpcklwd m7, m7 + pshufd m6, m6, q0000 + pshufd m7, m7, q0000 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*69+3 +%endif + add bufyq, 79+82*3 + mov hd, 70-35*%3 +.y_loop_ar2: + mov xq, -(76>>%2) + +.x_loop_ar2: + pxor m2, m2 + movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] + movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] + pcmpgtb m2, m0 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 + psrldq m5, m0, 2 ; y=-2,x=[-1,+5] + psrldq m3, m1, 2 ; y=-1,x=[-1,+5] + psrldq m4, m1, 4 ; y=-1,x=[+0,+5] + punpcklwd m2, m0, m5 + punpcklwd m3, m4 + pmaddwd m2, m8 + pmaddwd m3, m11 + paddd m2, m3 + + psrldq m4, m0, 4 ; y=-2,x=[+0,+5] + psrldq m5, m0, 6 ; y=-2,x=[+1,+5] + psrldq m0, 8 ; y=-2,x=[+2,+5] + punpcklwd m4, m5 + punpcklwd m0, m1 + psrldq m3, m1, 6 ; y=-1,x=[+1,+5] + psrldq m1, m1, 8 ; y=-1,x=[+2,+5] + punpcklwd m3, m1 + pmaddwd m4, m9 + pmaddwd m0, m10 + pmaddwd m3, m12 + paddd m4, m0 + paddd m2, m3 + paddd m2, m4 + +%if %2 + movq m1, [bufyq+xq*2] +%if %3 + movq m3, [bufyq+xq*2+82] +%endif + pmaddubsw m0, m6, m1 +%if %3 + pmaddubsw m1, m6, m3 + paddw m0, m1 +%endif + pmulhrsw m0, m7 +%else + movd m0, [bufyq+xq] + pxor m1, m1 + pcmpgtb m1, m0 + punpcklbw m0, m1 +%endif + punpcklwd m0, m15 + pmaddwd m0, m14 + paddd m2, m0 + + movq m0, [bufq+xq-2] ; y=0,x=[-2,+5] + pxor m4, m4 + movd m5, [base+byte_blend+1] + punpcklbw m5, m5 +.x_loop_ar2_inner: + pcmpgtb m1, m4, m0 + punpcklbw m0, m1 + pmaddwd m3, m0, m13 + paddd m3, m2 + psrldq m2, 4 ; shift top to next pixel + psrad m3, [fg_dataq+FGData.ar_coeff_shift] + pslldq m3, 4 + pand m3, m5 + paddw m0, m3 + packsswb m0, m0 + movd [bufq+xq-2], m0 + psrldq m0, 1 + inc xq + jz .x_loop_ar2_end + test xq, 3 + jnz .x_loop_ar2_inner + jmp .x_loop_ar2 + +.x_loop_ar2_end: + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar2 + RET + +.ar3: +%if ARCH_X86_32 +%assign stack_offset stack_offset_old +%assign stack_size_padded 0 +%xdefine rstk rsp +%endif + DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift + movifnidn bufyq, bufymp +%if ARCH_X86_32 + ALLOC_STACK -15*16 +%else + SUB rsp, 16*7 +%assign stack_size_padded (stack_size_padded+16*7) +%assign stack_size (stack_size+16*7) +%endif + mov shiftd, [fg_dataq+FGData.ar_coeff_shift] + imul uvd, 28 
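+ ; FGData.ar_coeffs_uv is laid out as 28 bytes per chroma plane (25
+ ; coefficients plus padding), so uv*28 selects this plane's coefficient set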
+ + movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15 + pxor m3, m3 + pcmpgtb m3, m0 + punpckhbw m1, m0, m3 + punpcklbw m0, m3 + pshufd m2, m0, q1111 + pshufd m3, m0, q2222 + pshufd m4, m0, q3333 + pshufd m0, m0, q0000 + pshufd m5, m1, q1111 + pshufd m6, m1, q2222 + pshufd m7, m1, q3333 + pshufd m1, m1, q0000 + mova [rsp+ 0*16], m0 + mova [rsp+ 1*16], m2 + mova [rsp+ 2*16], m3 + mova [rsp+ 3*16], m4 + mova [rsp+ 4*16], m1 + mova [rsp+ 5*16], m5 + mova [rsp+ 6*16], m6 + SCRATCH 7, 8, 7 + + movu m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-24 [24=luma] + pxor m4, m4 + pcmpgtb m4, m2 + punpckhbw m5, m2, m4 + punpcklbw m2, m4 + pshufd m4, m2, q3232 + punpcklwd m3, m4, m5 + pshuflw m5, m4, q3321 + pshufd m4, m3, q0000 + pshufd m3, m2, q1111 + pshufd m2, m2, q0000 + pinsrw m5, [base+round_vals+shiftq*2-10], 3 + SCRATCH 2, 9, 8 + SCRATCH 3, 10, 9 + SCRATCH 4, 11, 10 + SCRATCH 5, 12, 11 + + movd m2, [base+round_vals-12+shiftq*2] +%if %2 + movd m1, [base+pb_1] + movd m3, [base+hmul_bits+2+%3*2] +%endif + pxor m0, m0 + punpcklwd m2, m0 +%if %2 + punpcklwd m3, m3 +%endif + pshufd m2, m2, q0000 +%if %2 + pshufd m1, m1, q0000 + pshufd m3, m3, q0000 + SCRATCH 1, 13, 12 +%endif + SCRATCH 2, 14, 13 +%if %2 + SCRATCH 3, 15, 14 +%endif + + DEFINE_ARGS buf, bufy, fg_data, h, unused, x +%if %2 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*69+3 +%endif + add bufyq, 79+82*3 + mov hd, 70-35*%3 +.y_loop_ar3: + mov xq, -(76>>%2) + +.x_loop_ar3: + movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] + pxor m4, m4 + pcmpgtb m4, m0 + punpckhbw m3, m0, m4 + punpcklbw m0, m4 + + psrldq m5, m0, 2 + psrldq m6, m0, 4 + psrldq m7, m0, 6 + punpcklwd m4, m0, m5 + punpcklwd m6, m7 + pmaddwd m4, [rsp+ 0*16] + pmaddwd m6, [rsp+ 1*16] + paddd m4, m6 + + palignr m2, m3, m0, 10 + palignr m3, m0, 12 + psrldq m0, 8 + + movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] + pxor m6, m6 + pcmpgtb m6, m1 + punpckhbw m5, m1, m6 + punpcklbw m1, m6 + + punpcklwd m0, m2 + punpcklwd m3, m1 + pmaddwd m0, [rsp+ 2*16] + pmaddwd m3, [rsp+ 3*16] + paddd m0, m3 + paddd m0, m4 + + movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] + pxor m7, m7 + pcmpgtb m7, m2 + punpckhbw m6, m2, m7 + punpcklbw m2, m7 + + palignr m3, m5, m1, 10 + palignr m5, m1, 12 + psrldq m4, m2, 2 + + punpcklwd m3, m5 + punpcklwd m5, m2, m4 + pmaddwd m3, [rsp+ 6*16] + pmaddwd m5, m8 + paddd m3, m5 + paddd m0, m3 + + psrldq m3, m1, 2 + psrldq m4, m1, 4 + psrldq m5, m1, 6 + psrldq m1, 8 + + punpcklwd m3, m4 + punpcklwd m5, m1 + pmaddwd m3, [rsp+ 4*16] + pmaddwd m5, [rsp+ 5*16] + paddd m3, m5 + paddd m0, m3 + +%if %2 + movq m1, [bufyq+xq*2] +%if %3 + movq m3, [bufyq+xq*2+82] +%endif + pmaddubsw m7, m13, m1 +%if %3 + pmaddubsw m5, m13, m3 + paddw m7, m5 +%endif + pmulhrsw m7, m15 +%else + movd m7, [bufyq+xq] + pxor m1, m1 + pcmpgtb m1, m7 + punpcklbw m7, m1 +%endif + + psrldq m1, m2, 4 + psrldq m3, m2, 6 + palignr m4, m6, m2, 10 + palignr m6, m2, 12 + psrldq m2, 8 + + punpcklwd m1, m3 + punpcklwd m2, m4 + punpcklwd m6, m7 + pmaddwd m1, m9 + pmaddwd m2, m10 + pmaddwd m6, m11 + paddd m1, m2 + paddd m0, m6 + paddd m0, m1 + paddd m0, m14 + + movq m1, [bufq+xq-3] ; y=0,x=[-3,+4] + pxor m4, m4 + movd m5, [base+byte_blend] +.x_loop_ar3_inner: + pcmpgtb m2, m4, m1 + punpcklbw m3, m1, m2 + pmaddwd m2, m3, m12 + pshufd m3, m2, q1111 + paddd m2, m3 ; left+cur + paddd m2, m0 ; add top + psrldq m0, 4 + psrad m2, [fg_dataq+FGData.ar_coeff_shift] + ; don't packssdw, we only care about one value + packsswb m2, m2 + pandn m3, m5, m1 + pslld m2, 24 + pand m2, m5 + por m1, m2, m3 + movd [bufq+xq-3], 
m1 + psrldq m1, 1 + inc xq + jz .x_loop_ar3_end + test xq, 3 + jnz .x_loop_ar3_inner + jmp .x_loop_ar3 + +.x_loop_ar3_end: + add bufq, 82 + add bufyq, 82<<%3 + dec hd + jg .y_loop_ar3 + RET +%endmacro + +generate_grain_uv_fn 420, 1, 1 +generate_grain_uv_fn 422, 1, 0 +generate_grain_uv_fn 444, 0, 0 + +%macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg +%assign %%idx 0 +%define %%tmp %2 +%if %0 == 6 +%define %%tmp %6 +%endif +%rep 4 +%if %%idx == 0 + movd %5 %+ d, %2 + pshuflw %%tmp, %2, q3232 +%else + movd %5 %+ d, %%tmp +%if %%idx == 2 + punpckhqdq %%tmp, %%tmp +%elif %%idx == 4 + psrlq %%tmp, 32 +%endif +%endif + movzx %4 %+ d, %5 %+ w + shr %5 %+ d, 16 + +%if %%idx == 0 + movd %1, [%3+%4] +%else + pinsrw %1, [%3+%4], %%idx + 0 +%endif + pinsrw %1, [%3+%5], %%idx + 1 +%assign %%idx %%idx+2 +%endrep +%endmacro + +INIT_XMM ssse3 +; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby) +%if ARCH_X86_32 +%if STACK_ALIGNMENT < mmsize +cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (5 * mmsize + 16 * gprsize), \ + dst, src, scaling, unused1, fg_data, picptr, unused2 + ; copy stack arguments to new position post-alignment, so that we + ; don't have to keep the old stack location in a separate register + mov r0, r0m + mov r1, r2m + mov r2, r4m + mov r3, r6m + mov r4, r7m + mov r5, r8m + + mov [rsp+5*mmsize+ 4*gprsize], r0 + mov [rsp+5*mmsize+ 6*gprsize], r1 + mov [rsp+5*mmsize+ 8*gprsize], r2 + mov [rsp+5*mmsize+10*gprsize], r3 + mov [rsp+5*mmsize+11*gprsize], r4 + mov [rsp+5*mmsize+12*gprsize], r5 +%else +cglobal fgy_32x32xn_8bpc, 0, 7, 16, 5 * mmsize + 4 * gprsize, \ + dst, src, scaling, unused1, fg_data, picptr, unused2 +%endif + mov srcq, srcm + mov fg_dataq, r3m + mov scalingq, r5m +%if STACK_ALIGNMENT < mmsize +%define r0m [rsp+5*mmsize+ 4*gprsize] +%define r1m [rsp+5*mmsize+ 5*gprsize] +%define r2m [rsp+5*mmsize+ 6*gprsize] +%define r3m [rsp+5*mmsize+ 7*gprsize] +%define r4m [rsp+5*mmsize+ 8*gprsize] +%define r5m [rsp+5*mmsize+ 9*gprsize] +%define r6m [rsp+5*mmsize+10*gprsize] +%define r7m [rsp+5*mmsize+11*gprsize] +%define r8m [rsp+5*mmsize+12*gprsize] +%endif + LEA r5, pb_mask +%define base r5-pb_mask + mov r5m, picptrq +%else +cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut + lea r7, [pb_mask] +%define base r7-pb_mask +%endif + mov r6d, [fg_dataq+FGData.scaling_shift] + movd m3, [base+mul_bits+r6*2-14] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] + movd m4, [base+max+r6*4] + movd m5, [base+min+r6*2] + punpcklwd m3, m3 + punpcklwd m4, m4 + punpcklwd m5, m5 + pshufd m3, m3, q0000 + pshufd m4, m4, q0000 + pshufd m5, m5, q0000 + SCRATCH 3, 11, 0 + SCRATCH 4, 12, 1 + SCRATCH 5, 13, 2 + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap +%endif + + mov sbyd, r8m + mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 + test overlapd, overlapd + jz .no_vertical_overlap + mova m6, [base+pw_1024] + mova m7, [base+pb_27_17_17_27] + SCRATCH 6, 14, 3 + SCRATCH 7, 15, 4 + test sbyd, sbyd + jnz .vertical_overlap + ; fall-through + +.no_vertical_overlap: + mov r8m, overlapd +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused + imul seed, (173 << 24) | 37 +%else + imul seed, sbyd, (173 << 24) | 37 +%endif + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, w, 
picptr, src_bak + + mov r3m, seed + mov wq, r4m +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + unused1, unused2, see, unused3 +%endif + + lea src_bakq, [srcq+wq] + neg wq + sub dstmp, srcq +%if ARCH_X86_32 + mov r1m, src_bakq + mov r4m, wq + DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 +%endif + +.loop_x: +%if ARCH_X86_32 + mov seed, r3m +%endif + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, unused + + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + +%if ARCH_X86_32 + ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr, + ; r6m=grain_lut, r7m=h, r8m=overlap_v|h + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, unused +%endif + +.loop_x_odd: + mov hd, r7m + mov grain_lutq, grain_lutmp +.loop_y: + ; src + mova m0, [srcq] + pxor m2, m2 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m4, m0, scalingq-1, r0, r5, m3 + vpgatherdw m5, m1, scalingq-1, r0, r5, m3 +%else + vpgatherdw m4, m0, scalingq-1, r12, r13, m3 + vpgatherdw m5, m1, scalingq-1, r12, r13, m3 +%endif + REPX {psrlw x, 8}, m4, m5 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] + pcmpgtb m7, m2, m3 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m2, m4 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + + add srcq, r2mp + add grain_lutq, 82 + dec hd + jg .loop_y + +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end +%if ARCH_X86_32 + mov srcq, r1mp + add srcq, r4mp +%else + lea srcq, [src_bakq+wq] +%endif + btc dword r8m, 2 + jc .next_blk + + add offxyd, 16 + test dword r8m, 2 ; r8m & 2 = have_top_overlap + jz .loop_x_odd + +%if ARCH_X86_32 + add dword [rsp+5*mmsize+1*gprsize], 16 +%else + add r11d, 16 ; top_offxyd +%endif + jnz .loop_x_odd_v_overlap + +.next_blk: + test dword r8m, 1 + jz .loop_x + + test dword r8m, 2 + jnz .loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +.loop_x_h_overlap: +%if ARCH_X86_32 + ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr, + ; r6m=grain_lut, r7m=h, r8m=overlap_v|h + DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3 + + add offxyd, 16 ; left_offxyd + mov [rsp+5*mmsize+0*gprsize], offxyd + + DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 + + mov seed, r3m +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, left_offxy + + lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx +%endif + + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + mov offyd, seed + mov offxd, seed +%endif + ror 
offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164 + lea offyq, [offyq+offxq*2+747] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, left_offxy +%endif + + mov hd, r7m + mov grain_lutq, grain_lutmp +.loop_y_h_overlap: + ; src + mova m0, [srcq] + pxor m2, m2 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m4, m0, scalingq-1, r0, r5, m3 + vpgatherdw m5, m1, scalingq-1, r0, r5, m3 +%else + vpgatherdw m4, m0, scalingq-1, r12, r13, m3 + vpgatherdw m5, m1, scalingq-1, r12, r13, m3 +%endif + REPX {psrlw x, 8}, m4, m5 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] +%if ARCH_X86_32 + mov r5, [rsp+5*mmsize+0*gprsize] + movd m7, [grain_lutq+r5] +%else + movd m7, [grain_lutq+left_offxyq] +%endif + punpcklbw m7, m3 + pmaddubsw m6, m15, m7 + pmulhrsw m6, m14 + packsswb m6, m6 + shufps m6, m3, q3210 + pcmpgtb m2, m6 + punpcklbw m7, m6, m2 + punpckhbw m6, m2 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m7, m4 + pmullw m6, m5 + pmulhrsw m7, m11 + pmulhrsw m6, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m7 + paddw m1, m6 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + + add srcq, r2mp + add grain_lutq, 82 + dec hd + jg .loop_y_h_overlap + +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end +%if ARCH_X86_32 + mov srcq, r1m + add srcq, r4m +%else + lea srcq, [src_bakq+wq] +%endif + xor dword r8m, 4 + add offxyd, 16 + + ; since this half-block had left-overlap, the next does not + test dword r8m, 2 ; have_top_overlap + jz .loop_x_odd +%if ARCH_X86_32 + add dword [rsp+5*mmsize+1*gprsize], 16 +%else + add r11d, 16 ; top_offxyd +%endif + jmp .loop_x_odd_v_overlap + +.end: + RET + +.vertical_overlap: +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap +%endif + + or overlapd, 2 ; top_overlap: overlap & 2 + mov r8m, overlapd + movzx sbyd, sbyb +%if ARCH_X86_32 + imul r4, [fg_dataq+FGData.seed], 0x00010001 + DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused +%else + imul seed, [fg_dataq+FGData.seed], 0x00010001 +%endif + imul tmpd, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add tmpd, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and tmpd, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, tmpd +%if ARCH_X86_32 + xor sbyd, seed ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak + + mov r3m, seed + mov wq, r4m +%else + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + tmp, unused2, see, unused3 +%endif + + lea src_bakq, [srcq+wq] + neg wq + sub dstmp, srcq +%if ARCH_X86_32 + mov r1m, src_bakq + mov r4m, wq + DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 +%endif + +.loop_x_v_overlap: +%if ARCH_X86_32 + mov seed, r3m +%endif + ; we assume from the block above that bits 8-15 of tmpd are zero'ed, + ; because of the 'and tmpd, 0x00ff00ff' above + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp tmpb ; parity of top_seed + shr seed, 16 + shl tmpd, 16 + test seeb, seeh + setp tmpb ; parity of cur_seed + or r6d, 0x00010001 + xor tmpd, r6d + mov seed, tmpd + ror seed, 1 ; updated (cur_seed << 
16) | top_seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, unused, top_offxy + + mov offyd, seed + mov offxd, seed +%endif + + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + +%if ARCH_X86_32 + DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, unused, top_offxy +%endif + + movzx top_offxyd, offxyw +%if ARCH_X86_32 + mov [rsp+5*mmsize+1*gprsize], top_offxyd + + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + shr offxyd, 16 + +.loop_x_odd_v_overlap: +%if ARCH_X86_32 + mov r5, r5m + lea r5, [base+pb_27_17] + mov [rsp+5*mmsize+12], r5 +%else + mova m8, [pb_27_17] +%endif + mov hd, r7m + mov grain_lutq, grain_lutmp +.loop_y_v_overlap: + ; src + mova m0, [srcq] + pxor m2, m2 + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m4, m0, scalingq-1, r0, r5, m3 + vpgatherdw m5, m1, scalingq-1, r0, r5, m3 +%else + vpgatherdw m4, m0, scalingq-1, r12, r13, m3 + vpgatherdw m5, m1, scalingq-1, r12, r13, m3 +%endif + REPX {psrlw x, 8}, m4, m5 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] +%if ARCH_X86_32 + mov r5, [rsp+5*mmsize+1*gprsize] + movu m7, [grain_lutq+r5] +%else + movu m7, [grain_lutq+top_offxyq] +%endif + punpckhbw m6, m7, m3 + punpcklbw m7, m3 +%if ARCH_X86_32 + mov r5, [rsp+5*mmsize+12] + pmaddubsw m3, [r5], m6 + pmaddubsw m6, [r5], m7 +%else + pmaddubsw m3, m8, m6 + pmaddubsw m6, m8, m7 +%endif + pmulhrsw m3, m14 + pmulhrsw m6, m14 + packsswb m6, m3 + pcmpgtb m7, m2, m6 + punpcklbw m2, m6, m7 + punpckhbw m6, m7 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m2, m4 + pmullw m6, m5 + pmulhrsw m2, m11 + pmulhrsw m6, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m6 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + +%if ARCH_X86_32 + add dword [rsp+5*mmsize+12], mmsize +%else + mova m8, [pb_17_27] +%endif + add srcq, r2mp + add grain_lutq, 82 + dec hw + jz .end_y_v_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + btc hd, 16 + jnc .loop_y_v_overlap + jmp .loop_y + +.end_y_v_overlap: +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end_hv +%if ARCH_X86_32 + mov srcq, r1mp + add srcq, r4mp +%else + lea srcq, [src_bakq+wq] +%endif + btc dword r8m, 2 + jc .loop_x_hv_overlap + add offxyd, 16 +%if ARCH_X86_32 + add dword [rsp+5*mmsize+1*gprsize], 16 +%else + add top_offxyd, 16 +%endif + jmp .loop_x_odd_v_overlap + +.loop_x_hv_overlap: +%if ARCH_X86_32 + mov r5, r5m + lea r5, [base+pb_27_17] + mov [rsp+5*mmsize+12], r5 + + DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak + + mov r5, [rsp+5*mmsize+1*gprsize] + mov r4, offxyd + add r5, 16 + add r4, 16 + mov [rsp+5*mmsize+2*gprsize], r5 ; topleft_offxy + mov [rsp+5*mmsize+0*gprsize], r4 ; left_offxy + + DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak + + xor tmpd, tmpd + mov seed, r3m +%else + mova m8, [pb_27_17] + + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + tmp, unused2, see, unused3 + + ; we assume 
from the block above that bits 8-15 of tmpd are zero'ed +%endif + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp tmpb ; parity of top_seed + shr seed, 16 + shl tmpd, 16 + test seeb, seeh + setp tmpb ; parity of cur_seed + or r6d, 0x00010001 + xor tmpd, r6d + mov seed, tmpd + ror seed, 1 ; updated (cur_seed << 16) | top_seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy + + lea topleft_offxyq, [top_offxyq+16] + lea left_offxyq, [offyq+16] + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*2+0x10001*747+32*82] + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut + + movzx r5, offxyw ; top_offxy + mov [rsp+5*mmsize+1*gprsize], r5 +%else + DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy + + movzx top_offxyd, offxyw +%endif + shr offxyd, 16 + + mov hd, r7m + mov grain_lutq, grain_lutmp +.loop_y_hv_overlap: + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] +%if ARCH_X86_32 + mov r5, [rsp+5*mmsize+1*gprsize] ; top_offxy + mov r0, [rsp+5*mmsize+0*gprsize] ; left_offxy + movu m6, [grain_lutq+r5] + mov r5, [rsp+5*mmsize+2*gprsize] ; topleft_offxy + movd m4, [grain_lutq+r0] + movd m7, [grain_lutq+r5] +%else + movu m6, [grain_lutq+top_offxyq] + movd m4, [grain_lutq+left_offxyq] + movd m7, [grain_lutq+topleft_offxyq] +%endif + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklbw m4, m3 + punpcklbw m7, m6 + pmaddubsw m2, m15, m4 + pmaddubsw m4, m15, m7 + pmulhrsw m2, m14 + pmulhrsw m4, m14 + packsswb m2, m2 + packsswb m4, m4 + shufps m2, m3, q3210 + shufps m4, m6, q3210 + ; followed by v interpolation (top | cur -> cur) + punpcklbw m3, m4, m2 + punpckhbw m4, m2 +%if ARCH_X86_32 + mov r5, [rsp+5*mmsize+12] + pmaddubsw m7, [r5], m4 + pmaddubsw m4, [r5], m3 +%else + pmaddubsw m7, m8, m4 + pmaddubsw m4, m8, m3 +%endif + pmulhrsw m7, m14 + pmulhrsw m4, m14 + packsswb m4, m7 + pxor m2, m2 + pcmpgtb m7, m2, m4 + punpcklbw m3, m4, m7 + punpckhbw m4, m7 + + ; src + mova m0, [srcq] + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m5, m0, scalingq-1, r0, r5, m7 + vpgatherdw m6, m1, scalingq-1, r0, r5, m7 +%else + vpgatherdw m5, m0, scalingq-1, r13, r14, m7 + vpgatherdw m6, m1, scalingq-1, r13, r14, m7 +%endif + REPX {psrlw x, 8}, m5, m6 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m3, m5 + pmullw m4, m6 + pmulhrsw m3, m11 + pmulhrsw m4, m11 + + ; dst = clip_pixel(src, noise) + paddw m0, m3 + paddw m1, m4 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + +%if ARCH_X86_32 + add dword [rsp+5*mmsize+12], mmsize +%else + mova m8, [pb_17_27] +%endif + add srcq, r2mp + add grain_lutq, 82 + dec hw + jz .end_y_hv_overlap + ; 2 lines get vertical overlap, then fall back to non-overlap code for + ; remaining (up to) 30 lines + btc hd, 16 + jnc .loop_y_hv_overlap + jmp .loop_y_h_overlap + +.end_y_hv_overlap: +%if ARCH_X86_32 + add r4mp, 16 +%else + add wq, 16 +%endif + jge .end_hv +%if ARCH_X86_32 + mov srcq, r1m + add srcq, r4m +%else + 
lea srcq, [src_bakq+wq] +%endif + xor dword r8m, 4 + add offxyd, 16 +%if ARCH_X86_32 + add dword [rsp+5*mmsize+1*gprsize], 16 +%else + add top_offxyd, 16 +%endif + jmp .loop_x_odd_v_overlap + +.end_hv: + RET + +%macro FGUV_FN 3 ; name, ss_hor, ss_ver +INIT_XMM ssse3 +%if ARCH_X86_32 +; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h, +; sby, luma, lstride, uv_pl, is_id) +%if STACK_ALIGNMENT < mmsize +DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8 +cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (7 * mmsize + (13 + 3) * gprsize), \ + tmp, src, scaling, h, fg_data, picptr, unused + mov r0, r0m + mov r1, r2m + mov r2, r4m + mov r3, r6m + mov r4, r7m + mov [rsp+7*mmsize+3*gprsize], r0 + mov [rsp+7*mmsize+5*gprsize], r1 + mov [rsp+7*mmsize+7*gprsize], r2 + mov [rsp+7*mmsize+9*gprsize], r3 + mov [rsp+7*mmsize+10*gprsize], r4 + + mov r0, r8m + mov r1, r9m + mov r2, r10m + mov r4, r11m + mov r3, r12m + mov [rsp+7*mmsize+11*gprsize], r0 + mov [rsp+7*mmsize+12*gprsize], r1 + mov [rsp+7*mmsize+13*gprsize], r2 + mov [rsp+7*mmsize+14*gprsize], r4 +%else +cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 7 * mmsize + (4) * gprsize, \ + tmp, src, scaling, h, fg_data, picptr, unused +%endif + mov srcq, srcm + mov fg_dataq, r3m + mov scalingq, r5m +%if STACK_ALIGNMENT < mmsize +%define r0m [rsp+7*mmsize+ 3*gprsize] +%define r1m [rsp+7*mmsize+ 4*gprsize] +%define r2m [rsp+7*mmsize+ 5*gprsize] +%define r3m [rsp+7*mmsize+ 6*gprsize] +%define r4m [rsp+7*mmsize+ 7*gprsize] +%define r5m [rsp+7*mmsize+ 8*gprsize] +%define r6m [rsp+7*mmsize+ 9*gprsize] +%define r7m [rsp+7*mmsize+10*gprsize] +%define r8m [rsp+7*mmsize+11*gprsize] +%define r9m [rsp+7*mmsize+12*gprsize] +%define r10m [rsp+7*mmsize+13*gprsize] +%define r11m [rsp+7*mmsize+14*gprsize] +%define r12m [rsp+7*mmsize+15*gprsize] +%endif + LEA r5, pb_mask +%define base r5-pb_mask + mov r5m, r5 +%else +cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ + grain_lut, tmp, sby, luma, lstride, uv_pl, is_id + lea r8, [pb_mask] +%define base r8-pb_mask +%endif + mov r6d, [fg_dataq+FGData.scaling_shift] + movd m3, [base+mul_bits+r6*2-14] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] + lea tmpd, [r6d*2] +%if ARCH_X86_32 && STACK_ALIGNMENT < mmsize + test r3, r3 +%else + cmp dword r12m, 0 ; is_idm +%endif + movd m5, [base+min+r6*2] + cmovne r6d, tmpd + movd m4, [base+max+r6*2] + punpcklwd m3, m3 + punpcklwd m5, m5 + punpcklwd m4, m4 + pshufd m3, m3, q0000 + pshufd m5, m5, q0000 + pshufd m4, m4, q0000 + SCRATCH 3, 11, 0 + SCRATCH 4, 12, 1 + SCRATCH 5, 13, 2 + + cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 + jne .csfl + +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap +%endif + +%if %1 + mov r6d, dword r11m + movd m0, [fg_dataq+FGData.uv_mult+r6*4] + movd m1, [fg_dataq+FGData.uv_luma_mult+r6*4] + punpcklbw m6, m1, m0 + movd m7, [fg_dataq+FGData.uv_offset+r6*4] + punpcklwd m6, m6 + punpcklwd m7, m7 + pshufd m6, m6, q0000 + pshufd m7, m7, q0000 + SCRATCH 6, 14, 3 + SCRATCH 7, 15, 4 +%endif + + mov sbyd, r8m + mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 + test overlapd, overlapd + jz %%no_vertical_overlap +%if ARCH_X86_32 +%if %2 + mova m1, [base+pb_23_22_h] +%else + mova m1, [base+pb_27_17_17_27] +%endif + mova m0, [base+pw_1024] +%else +%if %2 + mova m1, [pb_23_22_h] +%else + mova m1, [pb_27_17_17_27] 
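+ ; with no horizontal subsampling, chroma edge overlap uses the same
+ ; 27/17/17/27 weights as luma; subsampled chroma uses 23/22 above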
+%endif + mova m0, [pw_1024] +%endif + SCRATCH 0, 8, 5 + SCRATCH 1, 9, 6 + test sbyd, sbyd + jnz %%vertical_overlap + ; fall-through + +%%no_vertical_overlap: + mov r8m, overlapd +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap + imul seed, (173 << 24) | 37 +%else + imul seed, sbyd, (173 << 24) | 37 +%endif + add seed, (105 << 24) | 178 + rol seed, 8 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak +%define luma_bakq lumaq + + mov wq, r4m +%if %3 + shl r10mp, 1 +%endif +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak + + mov lstrideq, r10mp +%endif + + mov lumaq, r9mp + lea src_bakq, [srcq+wq] + lea luma_bakq, [lumaq+wq*(1+%2)] + neg wq + sub r0mp, srcq +%if ARCH_X86_32 + mov r1m, src_bakq + mov r11m, luma_bakq + mov r4m, wq + + DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 +%else + mov r11mp, src_bakq + mov r12mp, strideq +%endif + +%%loop_x: +%if ARCH_X86_32 + mov seed, r3m +%endif + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, overlap, unused1, unused2, lstride + + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, overlap, unused1, unused2, lstride, luma_bak +%endif + +%%loop_x_odd: + mov hd, r7m + mov grain_lutq, grain_lutmp +%%loop_y: + ; src +%if ARCH_X86_32 + mov lumaq, r9mp +%endif +%if %2 + mova m4, [lumaq+ 0] + mova m6, [lumaq+16] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq + mov r5, r5m + movd m7, [base+pb_1] +%else + movd m7, [pb_1] +%endif + pshufd m7, m7, q0000 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif + pxor m2, m2 +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m4, scalingq-1, r0, r5 + vpgatherdw m5, m6, scalingq-1, r0, r5 +%else + vpgatherdw m7, m4, scalingq-1, r12, r2 + vpgatherdw m5, m6, scalingq-1, r12, r2 +%endif + REPX {psrlw x, 8}, m7, m5 + + ; unpack chroma_source + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq+ 0] + pcmpgtb m6, m2, m3 + punpcklbw m2, m3, m6 + punpckhbw m3, m6 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m2, m7 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + + ; dst = clip_pixel(src, noise) + paddw 
m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + +%if ARCH_X86_32 + add srcq, r2mp + ; we already incremented lumaq above +%else + add srcq, r12mp +%if %3 + lea lumaq, [lumaq+lstrideq*2] +%else + add lumaq, lstrideq +%endif +%endif + add grain_lutq, 82 + dec hw + jg %%loop_y + +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut + + mov wq, r4m +%endif + add wq, 16 + jge %%end +%if ARCH_X86_32 + mov srcq, r1mp + mov lumaq, r11mp +%else + mov srcq, r11mp +%endif + lea lumaq, [luma_bakq+wq*(1+%2)] + add srcq, wq +%if ARCH_X86_32 + mov r4m, wq + mov r9m, lumaq +%endif +%if %2 == 0 + ; adjust top_offxy +%if ARCH_X86_32 + add dword [rsp+7*mmsize+1*gprsize], 16 +%else + add r11d, 16 +%endif + add offxyd, 16 + btc dword r8m, 2 + jc %%loop_x_even + test dword r8m, 2 + jz %%loop_x_odd + jmp %%loop_x_odd_v_overlap +%%loop_x_even: +%endif + test dword r8m, 1 + jz %%loop_x + + ; r8m = sbym + test dword r8m, 2 + jne %%loop_x_hv_overlap + + ; horizontal overlap (without vertical overlap) +%%loop_x_h_overlap: +%if ARCH_X86_32 +%if %2 + lea r6, [offxyd+16] + mov [rsp+7*mmsize+0*gprsize], r6 +%else + mov [rsp+7*mmsize+0*gprsize], offxyd +%endif + + DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut + + mov seed, r3m +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, unused1, unused2, lstride + +%if %2 + lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx +%else + mov left_offxyd, offyd +%endif +%endif + mov r6d, seed + or seed, 0xEFF4 + shr r6d, 1 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, unused1, unused2, lstride + + mov offyd, seed + mov offxd, seed +%endif + ror offyd, 8 + shr offxd, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak +%endif + + mov hd, r7m + mov grain_lutq, grain_lutmp +%%loop_y_h_overlap: + ; src +%if ARCH_X86_32 + mov lumaq, r9mp +%endif +%if %2 + mova m4, [lumaq+ 0] + mova m6, [lumaq+16] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq + mov r5, r5m + movd m7, [base+pb_1] +%else + movd m7, [pb_1] +%endif + pshufd m7, m7, q0000 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif + pxor m2, m2 +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m4, scalingq-1, r0, r5 + vpgatherdw m5, m6, scalingq-1, r0, r5 +%else + vpgatherdw m7, m4, scalingq-1, r12, r2 + vpgatherdw m5, m6, scalingq-1, r12, r2 +%endif + REPX {psrlw x, 8}, m7, m5 + + ; unpack 
chroma_source + punpckhbw m1, m0, m2 + punpcklbw m0, m2 ; m0-1: src as word + + ; grain = grain_lut[offy+y][offx+x] + movu m4, [grain_lutq+offxyq+ 0] +%if ARCH_X86_32 + mov r0, [rsp+7*mmsize+0*gprsize] + movd m2, [grain_lutq+r0+ 0] +%else + movd m2, [grain_lutq+left_offxyq+ 0] +%endif + punpcklbw m2, m4 + pmaddubsw m3, m9, m2 + pmulhrsw m3, m8 + packsswb m3, m3 + shufps m3, m4, q3210 + pxor m4, m4 + pcmpgtb m4, m3 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m2, m7 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m13 + pmaxsw m1, m13 + pminsw m0, m12 + pminsw m1, m12 + packuswb m0, m1 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + +%if ARCH_X86_32 + add srcq, r2mp + ; lumaq has already been incremented above +%else + add srcq, r12mp +%if %3 + lea lumaq, [lumaq+lstrideq*2] +%else + add lumaq, lstrideq +%endif +%endif + add grain_lutq, 82 + dec hw + jg %%loop_y_h_overlap + +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut + + mov wq, r4m +%endif + add wq, 16 + jge %%end +%if ARCH_X86_32 + mov srcq, r1mp + mov lumaq, r11mp +%else + mov srcq, r11mp +%endif + lea lumaq, [luma_bakq+wq*(1+%2)] + add srcq, wq +%if ARCH_X86_32 + mov r4m, wq + mov r9m, lumaq +%endif +%if %2 == 0 + xor dword r8m, 4 + ; adjust top_offxyd +%if ARCH_X86_32 + add dword [rsp+7*mmsize+1*gprsize], 16 +%else + add r11d, 16 +%endif + add offxyd, 16 +%endif + + ; r8m = sbym + test dword r8m, 2 +%if %2 + jne %%loop_x_hv_overlap + jmp %%loop_x_h_overlap +%else + jne %%loop_x_odd_v_overlap + jmp %%loop_x_odd +%endif + +%%end: + RET + +%%vertical_overlap: +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap +%else + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap +%endif + + or overlapd, 2 ; top_overlap: overlap & 2 + mov r8m, overlapd + movzx sbyd, sbyb +%if ARCH_X86_32 + imul r4, [fg_dataq+FGData.seed], 0x00010001 + DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused +%else + imul seed, [fg_dataq+FGData.seed], 0x00010001 +%endif + imul tmpd, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add tmpd, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and tmpd, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, tmpd +%if ARCH_X86_32 + xor sbyd, seed ; (cur_seed << 16) | top_seed + + DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak + + mov r3m, seed + mov wq, r4m +%if %3 + shl r10mp, 1 +%endif +%else + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak + + mov lstrideq, r10mp +%endif + + mov lumaq, r9mp + lea src_bakq, [srcq+wq] + lea luma_bakq, [lumaq+wq*(1+%2)] + neg wq + sub r0mp, srcq +%if ARCH_X86_32 + mov r1m, src_bakq + mov r11m, luma_bakq + mov r4m, wq + + DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 +%else + mov r11mp, src_bakq + mov r12mp, strideq +%endif + +%%loop_x_v_overlap: +%if ARCH_X86_32 + mov seed, r3m + xor tmpd, tmpd +%endif + ; we assume from the block above that bits 8-15 of tmpd are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp tmpb ; parity of top_seed + shr seed, 16 + shl tmpd, 16 + test seeb, seeh + setp tmpb ; parity of cur_seed + or r6d, 0x00010001 + xor tmpd, r6d + mov seed, tmpd + ror seed, 1 ; 
updated (cur_seed << 16) | top_seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, overlap, top_offxy, unused, lstride + + mov offxd, seed + mov offyd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + +%if ARCH_X86_32 + DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak +%endif + + movzx top_offxyd, offxyw + shr offxyd, 16 +%if ARCH_X86_32 + mov [rsp+7*mmsize+1*gprsize], top_offxyd + + DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut +%endif + +%%loop_x_odd_v_overlap: + mov hd, r7m + mov grain_lutq, grain_lutmp +%if ARCH_X86_32 + mov r5, r5m +%endif +%if %3 + mova m1, [PIC_ptr(pb_23_22)] +%else + mova m1, [PIC_ptr(pb_27_17)] +%endif +%%loop_y_v_overlap: +%if ARCH_X86_32 + mov lumaq, r9mp +%endif +%if %2 + mova m4, [lumaq+ 0] + mova m6, [lumaq+16] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq + mov r5, r5m + movd m7, [base+pb_1] +%else + movd m7, [pb_1] +%endif + pshufd m7, m7, q0000 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif + pxor m2, m2 +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + ; scaling[luma_src] +%if ARCH_X86_32 + vpgatherdw m7, m4, scalingq-1, r0, r5 + vpgatherdw m5, m6, scalingq-1, r0, r5 +%else + vpgatherdw m7, m4, scalingq-1, r12, r2 + vpgatherdw m5, m6, scalingq-1, r12, r2 +%endif + REPX {psrlw x, 8}, m7, m5 + + ; grain = grain_lut[offy+y][offx+x] + movu m3, [grain_lutq+offxyq] +%if ARCH_X86_32 + mov r0, [rsp+7*mmsize+1*gprsize] + movu m4, [grain_lutq+r0] +%else + movu m4, [grain_lutq+top_offxyq] +%endif + punpckhbw m6, m4, m3 + punpcklbw m4, m3 + pmaddubsw m2, m1, m6 + pmaddubsw m3, m1, m4 + pmulhrsw m2, m8 + pmulhrsw m3, m8 + packsswb m3, m2 + pxor m6, m6 + pcmpgtb m6, m3 + punpcklbw m2, m3, m6 + punpckhbw m3, m6 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + pmullw m2, m7 + pmullw m3, m5 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + + ; unpack chroma_source + pxor m4, m4 + punpckhbw m6, m0, m4 + punpcklbw m0, m4 ; m0-1: src as word + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m6, m3 + pmaxsw m0, m13 + pmaxsw m6, m13 + pminsw m0, m12 + pminsw m6, m12 + packuswb m0, m6 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + + dec hw + je %%end_y_v_overlap +%if ARCH_X86_32 + add srcq, r2mp + ; lumaq has already been incremented above +%else + add srcq, r12mp +%if %3 + lea lumaq, [lumaq+lstrideq*2] +%else + add lumaq, lstrideq +%endif +%endif + add grain_lutq, 82 +%if %3 == 0 + btc hd, 16 +%if ARCH_X86_32 + mov r5, r5m +%endif + mova m1, [PIC_ptr(pb_17_27)] + jnc %%loop_y_v_overlap +%endif + 
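+ ; v overlap: the first grain row(s) of this block get blended with the
+ ; block above before scaling. In C terms (a sketch):
+ ;   grain = round2(above_grain * w[0] + cur_grain * w[1], 5)
+ ; with w = {27,17} for row 0 and {17,27} for row 1 (pb_27_17/pb_17_27),
+ ; or the single pair {23,22} (pb_23_22) for vertically subsampled chroma;
+ ; the pmaddubsw+pmulhrsw pair above computes this weighted rounded average.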
jmp %%loop_y + +%%end_y_v_overlap: +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut + + mov wq, r4m +%endif + add wq, 16 + jge %%end_hv +%if ARCH_X86_32 + mov srcq, r1mp + mov lumaq, r11mp +%else + mov srcq, r11mp +%endif + lea lumaq, [luma_bakq+wq*(1+%2)] + add srcq, wq +%if ARCH_X86_32 + mov r4m, wq + mov r9m, lumaq +%endif + +%if %2 + ; since fg_dataq.overlap is guaranteed to be set, we never jump + ; back to .loop_x_v_overlap, and instead always fall-through to + ; h+v overlap +%else +%if ARCH_X86_32 + add dword [rsp+7*mmsize+1*gprsize], 16 +%else + add top_offxyd, 16 +%endif + add offxyd, 16 + btc dword r8m, 2 + jnc %%loop_x_odd_v_overlap +%endif + +%%loop_x_hv_overlap: +%if ARCH_X86_32 + DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused + + mov r6, [rsp+7*mmsize+1*gprsize] +%if %2 + lea r0, [r3d+16] + add r6, 16 + mov [rsp+7*mmsize+0*gprsize], r0 ; left_offxy +%else + mov [rsp+7*mmsize+0*gprsize], r3 ; left_offxy +%endif + mov [rsp+7*mmsize+2*gprsize], r6 ; topleft_offxy + + DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused + + mov seed, r3m + xor tmpd, tmpd +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride + +%if %2 + lea topleft_offxyq, [top_offxyq+16] + lea left_offxyq, [offxyq+16] +%else + mov topleft_offxyq, top_offxyq + mov left_offxyq, offxyq +%endif + + ; we assume from the block above that bits 8-15 of tmpd are zero'ed +%endif + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp tmpb ; parity of top_seed + shr seed, 16 + shl tmpd, 16 + test seeb, seeh + setp tmpb ; parity of cur_seed + or r6d, 0x00010001 + xor tmpd, r6d + mov seed, tmpd + ror seed, 1 ; updated (cur_seed << 16) | top_seed + +%if ARCH_X86_32 + mov r3m, seed + + DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx + + mov offxd, offyd +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride + + mov offxd, seed + mov offyd, seed +%endif + ror offyd, 8 + ror offxd, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + +%if ARCH_X86_32 + DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut +%else + DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ + h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak +%endif + + movzx top_offxyd, offxyw + shr offxyd, 16 +%if ARCH_X86_32 + mov [rsp+7*mmsize+1*gprsize], top_offxyd +%endif + + mov hd, r7m + mov grain_lutq, grain_lutmp +%if ARCH_X86_32 + mov r5, r5m +%endif +%if %3 + mova m3, [PIC_ptr(pb_23_22)] +%else + mova m3, [PIC_ptr(pb_27_17)] +%endif +%%loop_y_hv_overlap: + ; grain = grain_lut[offy+y][offx+x] +%if ARCH_X86_32 + mov r0, [rsp+7*mmsize+2*gprsize] ; topleft_offxy + mov r5, [rsp+7*mmsize+1*gprsize] ; top_offxy + movd m1, [grain_lutq+r0] + mov r0, [rsp+7*mmsize+0*gprsize] ; left_offxy +%else + movd m1, [grain_lutq+topleft_offxyq] +%endif + movu m2, [grain_lutq+offxyq] +%if ARCH_X86_32 + movu m6, [grain_lutq+r5] + movd m4, [grain_lutq+r0] +%else + movu m6, [grain_lutq+top_offxyq] + movd m4, [grain_lutq+left_offxyq] +%endif + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) + punpcklbw m1, m6 + punpcklbw m4, m2 + pmaddubsw m0, m9, m1 + pmaddubsw m1, m9, m4 + REPX {pmulhrsw x, m8}, m0, m1 + packsswb m0, m1 + shufps m4, m0, m2, q3232 + shufps m0, 
m6, q3210 + ; followed by v interpolation (top | cur -> cur) + punpcklbw m2, m0, m4 + punpckhbw m0, m4 + pmaddubsw m4, m3, m0 + pmaddubsw m1, m3, m2 + pmulhrsw m4, m8 + pmulhrsw m1, m8 + packsswb m1, m4 + + ; src +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut + + mov lumaq, r9mp +%endif +%if %2 + mova m4, [lumaq+ 0] + mova m6, [lumaq+16] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq + mov r5, r5m + movd m7, [base+pb_1] +%else + movd m7, [pb_1] +%endif + pshufd m7, m7, q0000 + pxor m2, m2 + pmaddubsw m4, m7 + pmaddubsw m6, m7 + pavgw m4, m2 + pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif + pxor m2, m2 +%endif + +%if %1 +%if %2 + packuswb m4, m6 ; luma +%endif + punpckhbw m6, m4, m0 + punpcklbw m4, m0 ; { luma, chroma } + pmaddubsw m6, m14 + pmaddubsw m4, m14 + psraw m6, 6 + psraw m4, 6 + paddw m6, m15 + paddw m4, m15 + packuswb m4, m6 ; pack+unpack = clip + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 +%endif + + ; scaling[src] +%if ARCH_X86_32 + vpgatherdw m7, m4, scalingq-1, r0, r5 + vpgatherdw m5, m6, scalingq-1, r0, r5 +%else +%if %3 + vpgatherdw m7, m4, scalingq-1, r2, r12 + vpgatherdw m5, m6, scalingq-1, r2, r12 +%else + vpgatherdw m7, m4, scalingq-1, r2, r13 + vpgatherdw m5, m6, scalingq-1, r2, r13 +%endif +%endif + REPX {psrlw x, 8}, m7, m5 + + ; unpack grain + pxor m4, m4 + pcmpgtb m4, m1 + punpcklbw m2, m1, m4 + punpckhbw m1, m4 + + ; noise = round2(scaling[src] * grain, scaling_shift) + pmullw m2, m7 + pmullw m1, m5 + pmulhrsw m2, m11 + pmulhrsw m1, m11 + +%if ARCH_X86_32 + DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut +%endif + + ; unpack chroma source + pxor m4, m4 + punpckhbw m5, m0, m4 + punpcklbw m0, m4 ; m0-1: src as word + + ; dst = clip_pixel(src, noise) + paddw m0, m2 + paddw m5, m1 + pmaxsw m0, m13 + pmaxsw m5, m13 + pminsw m0, m12 + pminsw m5, m12 + packuswb m0, m5 + movifnidn dstq, dstmp + mova [dstq+srcq], m0 + +%if ARCH_X86_32 + add srcq, r2mp + ; lumaq has been adjusted above already +%else + add srcq, r12mp +%if %3 + lea lumaq, [lumaq+lstrideq*(1+%2)] +%else + add lumaq, r10mp +%endif +%endif + add grain_lutq, 82 + dec hw +%if %3 + jg %%loop_y_h_overlap +%else + jle %%end_y_hv_overlap +%if ARCH_X86_32 + mov r5, r5m +%endif + mova m3, [PIC_ptr(pb_17_27)] + btc hd, 16 + jnc %%loop_y_hv_overlap +%if ARCH_X86_64 + mov lstrideq, r10mp +%endif + jmp %%loop_y_h_overlap +%%end_y_hv_overlap: +%if ARCH_X86_64 + mov lstrideq, r10mp +%endif +%endif + +%if ARCH_X86_32 + DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut + + mov wq, r4m +%endif + add wq, 16 + jge %%end_hv +%if ARCH_X86_32 + mov srcq, r1mp + mov lumaq, r11mp +%else + mov srcq, r11mp +%endif + lea lumaq, [luma_bakq+wq*(1+%2)] + add srcq, wq +%if ARCH_X86_32 + mov r4m, wq + mov r9m, lumaq +%endif +%if %2 + jmp %%loop_x_hv_overlap +%else +%if ARCH_X86_32 + add dword [rsp+7*mmsize+1*gprsize], 16 +%else + add top_offxyd, 16 +%endif + add offxyd, 16 + xor dword r8m, 4 + jmp %%loop_x_odd_v_overlap +%endif + +%%end_hv: + RET +%endmacro + + %%FGUV_32x32xN_LOOP 1, %2, %3 +.csfl: + %%FGUV_32x32xN_LOOP 0, %2, %3 +%endmacro + +FGUV_FN 420, 1, 1 + +%if STACK_ALIGNMENT < mmsize +DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 +%endif + +FGUV_FN 422, 1, 0 + +%if STACK_ALIGNMENT < mmsize +DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 +%endif + +FGUV_FN 444, 0, 0 diff --git a/third_party/dav1d/src/x86/ipred.h 
b/third_party/dav1d/src/x86/ipred.h new file mode 100644 index 0000000000..415a4d8d62 --- /dev/null +++ b/third_party/dav1d/src/x86/ipred.h @@ -0,0 +1,151 @@ +/* + * Copyright © 2018-2021, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/ipred.h" + +#define decl_fn(type, name) \ + decl_##type##_fn(BF(dav1d_##name, ssse3)); \ + decl_##type##_fn(BF(dav1d_##name, avx2)); \ + decl_##type##_fn(BF(dav1d_##name, avx512icl)) +#define init_fn(type0, type1, name, suffix) \ + c->type0[type1] = BF(dav1d_##name, suffix) + +#define init_angular_ipred_fn(type, name, suffix) \ + init_fn(intra_pred, type, name, suffix) +#define init_cfl_pred_fn(type, name, suffix) \ + init_fn(cfl_pred, type, name, suffix) +#define init_cfl_ac_fn(type, name, suffix) \ + init_fn(cfl_ac, type, name, suffix) + +decl_fn(angular_ipred, ipred_dc); +decl_fn(angular_ipred, ipred_dc_128); +decl_fn(angular_ipred, ipred_dc_top); +decl_fn(angular_ipred, ipred_dc_left); +decl_fn(angular_ipred, ipred_h); +decl_fn(angular_ipred, ipred_v); +decl_fn(angular_ipred, ipred_paeth); +decl_fn(angular_ipred, ipred_smooth); +decl_fn(angular_ipred, ipred_smooth_h); +decl_fn(angular_ipred, ipred_smooth_v); +decl_fn(angular_ipred, ipred_z1); +decl_fn(angular_ipred, ipred_z2); +decl_fn(angular_ipred, ipred_z3); +decl_fn(angular_ipred, ipred_filter); + +decl_fn(cfl_pred, ipred_cfl); +decl_fn(cfl_pred, ipred_cfl_128); +decl_fn(cfl_pred, ipred_cfl_top); +decl_fn(cfl_pred, ipred_cfl_left); + +decl_fn(cfl_ac, ipred_cfl_ac_420); +decl_fn(cfl_ac, ipred_cfl_ac_422); +decl_fn(cfl_ac, ipred_cfl_ac_444); + +decl_fn(pal_pred, pal_pred); + +static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + + init_angular_ipred_fn(DC_PRED, ipred_dc, ssse3); + init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, ssse3); + init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, ssse3); + init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, ssse3); + init_angular_ipred_fn(HOR_PRED, ipred_h, ssse3); + init_angular_ipred_fn(VERT_PRED, ipred_v, ssse3); + init_angular_ipred_fn(PAETH_PRED, 
ipred_paeth, ssse3); + init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, ssse3); + init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, ssse3); + init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, ssse3); +#if BITDEPTH == 8 + init_angular_ipred_fn(Z1_PRED, ipred_z1, ssse3); + init_angular_ipred_fn(Z2_PRED, ipred_z2, ssse3); + init_angular_ipred_fn(Z3_PRED, ipred_z3, ssse3); +#endif + init_angular_ipred_fn(FILTER_PRED, ipred_filter, ssse3); + + init_cfl_pred_fn(DC_PRED, ipred_cfl, ssse3); + init_cfl_pred_fn(DC_128_PRED, ipred_cfl_128, ssse3); + init_cfl_pred_fn(TOP_DC_PRED, ipred_cfl_top, ssse3); + init_cfl_pred_fn(LEFT_DC_PRED, ipred_cfl_left, ssse3); + + init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I420 - 1, ipred_cfl_ac_420, ssse3); + init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, ssse3); + init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, ssse3); + + c->pal_pred = BF(dav1d_pal_pred, ssse3); + +#if ARCH_X86_64 + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; + + init_angular_ipred_fn(DC_PRED, ipred_dc, avx2); + init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, avx2); + init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, avx2); + init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx2); + init_angular_ipred_fn(HOR_PRED, ipred_h, avx2); + init_angular_ipred_fn(VERT_PRED, ipred_v, avx2); + init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx2); + init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx2); + init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx2); + init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx2); + init_angular_ipred_fn(Z1_PRED, ipred_z1, avx2); + init_angular_ipred_fn(Z2_PRED, ipred_z2, avx2); + init_angular_ipred_fn(Z3_PRED, ipred_z3, avx2); + init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx2); + + init_cfl_pred_fn(DC_PRED, ipred_cfl, avx2); + init_cfl_pred_fn(DC_128_PRED, ipred_cfl_128, avx2); + init_cfl_pred_fn(TOP_DC_PRED, ipred_cfl_top, avx2); + init_cfl_pred_fn(LEFT_DC_PRED, ipred_cfl_left, avx2); + + init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I420 - 1, ipred_cfl_ac_420, avx2); + init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, avx2); + init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, avx2); + + c->pal_pred = BF(dav1d_pal_pred, avx2); + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + +#if BITDEPTH == 8 + init_angular_ipred_fn(DC_PRED, ipred_dc, avx512icl); + init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, avx512icl); + init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, avx512icl); + init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx512icl); + init_angular_ipred_fn(HOR_PRED, ipred_h, avx512icl); + init_angular_ipred_fn(VERT_PRED, ipred_v, avx512icl); +#endif + init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx512icl); + init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx512icl); + init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx512icl); + init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx512icl); + init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx512icl); + + c->pal_pred = BF(dav1d_pal_pred, avx512icl); +#endif +} diff --git a/third_party/dav1d/src/x86/ipred16_avx2.asm b/third_party/dav1d/src/x86/ipred16_avx2.asm new file mode 100644 index 0000000000..7ddb189916 --- /dev/null +++ b/third_party/dav1d/src/x86/ipred16_avx2.asm @@ -0,0 +1,4992 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA 64 + +%macro SMOOTH_WEIGHTS 1-* +const smooth_weights_1d_16bpc ; sm_weights[] << 7 + %rep %0 + dw %1*128 + %rotate 1 + %endrep +const smooth_weights_2d_16bpc ; sm_weights[], 256 - sm_weights[] + %rep %0 + dw %1, 256-%1 + %rotate 1 + %endrep +%endmacro + +SMOOTH_WEIGHTS 0, 0, 255, 128, 255, 149, 85, 64, \ + 255, 197, 146, 105, 73, 50, 37, 32, \ + 255, 225, 196, 170, 145, 123, 102, 84, \ + 68, 54, 43, 33, 26, 20, 17, 16, \ + 255, 240, 225, 210, 196, 182, 169, 157, \ + 145, 133, 122, 111, 101, 92, 83, 74, \ + 66, 59, 52, 45, 39, 34, 29, 25, \ + 21, 17, 14, 12, 10, 9, 8, 8, \ + 255, 248, 240, 233, 225, 218, 210, 203, \ + 196, 189, 182, 176, 169, 163, 156, 150, \ + 144, 138, 133, 127, 121, 116, 111, 106, \ + 101, 96, 91, 86, 82, 77, 73, 69, \ + 65, 61, 57, 54, 50, 47, 44, 41, \ + 38, 35, 32, 29, 27, 25, 22, 20, \ + 18, 16, 15, 13, 12, 10, 9, 8, \ + 7, 6, 6, 5, 5, 4, 4, 4 + +%if ARCH_X86_64 + +ipred_hv_shuf: db 6, 7, 6, 7, 0, 1, 2, 3, 2, 3, 2, 3, 8, 9, 10, 11 + db 4, 5, 4, 5, 4, 5, 6, 7, 0, 1, 0, 1, 12, 13, 14, 15 +filter_shuf1: db 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 14, 15, 12, 13, -1, -1 +filter_shuf2: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1 +filter_shuf3: db 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 8, 9, -1, -1 +pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 +z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 + dw 8*64, 9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64 +z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0 +z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0 +z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 + db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1 +pw_m1024: times 2 dw -1024 +pw_1to16: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +pw_16to1: dw 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 +z2_ymul: dw 1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4 +z2_ymul8: dw 1, 2, 5, 6, 3, 4, 7, 8, 5, 6, 16, 16, 7, 8 +pb_90: times 4 db 90 +z2_y_shuf_h4: dd 3, 7, 2, 6, 1, 5, 0, 4 +z_upsample: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 +z2_x_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 
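+; the z1/z2 loops below track edge positions in 26.6 fixed point: dx/dy are
+; per-step increments looked up in dr_intra_derivative[], z_base_inc preloads
+; i << 6 per lane, pos >> 6 selects the integer edge sample, and pos & 62
+; (pw_62) is the 6-bit sub-pixel fraction used for interpolation.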
+z2_y_shuf: db 6, 7, 14, 15, 4, 5, 12, 13, 4, 5, 12, 13, 2, 3, 10, 11 +z2_y_shuf_us: db 6, 7, 14, 15, 2, 3, 10, 11, 4, 5, 12, 13, 0, 1, 8, 9 +z_filter_k: dw 4, 4, 5, 5, 4, 4 + dw 8, 8, 6, 6, 4, 4 + dw 0, 0, 0, 0, 2, 2 + +%define pw_2 (z_filter_k+32) +%define pw_4 (z_filter_k+ 0) +%define pw_16 (z2_ymul8 +20) + +pw_1: times 2 dw 1 +pw_3: times 2 dw 3 +pw_62: times 2 dw 62 +pw_512: times 2 dw 512 +pw_2048: times 2 dw 2048 +pd_8: dd 8 + +%macro JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - 2*4) + %xdefine %%base mangle(private_prefix %+ _%1_%2) + %%table: + %rep %0 - 2 + dd %%base %+ .%3 - (%%table - 2*4) + %rotate 1 + %endrep +%endmacro + +%define ipred_dc_splat_16bpc_avx2_table (ipred_dc_16bpc_avx2_table + 10*4) +%define ipred_cfl_splat_16bpc_avx2_table (ipred_cfl_16bpc_avx2_table + 8*4) + +JMP_TABLE ipred_dc_16bpc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ + s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 +JMP_TABLE ipred_dc_left_16bpc, avx2, h4, h8, h16, h32, h64 +JMP_TABLE ipred_h_16bpc, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_paeth_16bpc, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_16bpc, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_h_16bpc, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_v_16bpc, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z1_16bpc, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z2_16bpc, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z3_16bpc, avx2, h4, h8, h16, h32, h64 +JMP_TABLE ipred_filter_16bpc, avx2, w4, w8, w16, w32 +JMP_TABLE ipred_cfl_16bpc, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \ + s4-8*4, s8-8*4, s16-8*4, s32-8*4 +JMP_TABLE ipred_cfl_left_16bpc, avx2, h4, h8, h16, h32 +JMP_TABLE ipred_cfl_ac_444_16bpc, avx2, w4, w8, w16, w32 +JMP_TABLE pal_pred_16bpc, avx2, w4, w8, w16, w32, w64 + +cextern dr_intra_derivative +cextern filter_intra_taps + +SECTION .text + +INIT_YMM avx2 +cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h + movifnidn hd, hm + add tlq, 2 + movd xm4, wd + pxor xm3, xm3 + pavgw xm4, xm3 + tzcnt wd, wd + movd xm5, wd + movu m0, [tlq] + lea r5, [ipred_dc_left_16bpc_avx2_table] + movsxd r6, [r5+wq*4] + add r6, r5 + add r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 + +cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 + mov hd, hm + sub tlq, hq + movd xm4, hd + sub tlq, hq + pxor xm3, xm3 + pavgw xm4, xm3 + tzcnt r6d, hd + movd xm5, r6d + movu m0, [tlq] + lea r5, [ipred_dc_left_16bpc_avx2_table] + movsxd r6, [r5+r6*4] + add r6, r5 + add r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table + tzcnt wd, wd + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 +.h64: + paddw m0, [tlq+96] + paddw m0, [tlq+64] +.h32: + paddw m0, [tlq+32] +.h16: + vextracti128 xm1, m0, 1 + paddw xm0, xm1 +.h8: + psrldq xm1, xm0, 8 + paddw xm0, xm1 +.h4: + punpcklwd xm0, xm3 + psrlq xm1, xm0, 32 + paddd xm0, xm1 + psrldq xm1, xm0, 8 + paddd xm0, xm1 + paddd xm0, xm4 + psrld xm0, xm5 + lea stride3q, [strideq*3] + vpbroadcastw m0, xm0 + mova m1, m0 + mova m2, m0 + mova m3, m0 + jmp wq + +cglobal ipred_dc_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 + movifnidn hd, hm + tzcnt r6d, hd + lea r5d, [wq+hq] + movd xm4, r5d + tzcnt r5d, r5d + movd xm5, r5d + lea r5, [ipred_dc_16bpc_avx2_table] + tzcnt wd, wd + movsxd r6, [r5+r6*4] + movsxd wq, [r5+wq*4+5*4] + pxor m3, m3 + psrlw xm4, 1 + add r6, r5 + add wq, r5 + lea stride3q, [strideq*3] + jmp r6 +.h4: + movq xm0, [tlq-8] + jmp wq +.w4: + movq xm1, [tlq+2] + paddw m0, m4 + paddw m0, m1 + psrlq 
m1, m0, 32 + paddw m0, m1 + psrld m1, m0, 16 + paddw m0, m1 + cmp hd, 4 + jg .w4_mul + psrlw xm0, 3 + jmp .w4_end +.w4_mul: + vextracti128 xm1, m0, 1 + paddw xm0, xm1 + lea r2d, [hq*2] + mov r6d, 0xAAAB6667 + shrx r6d, r6d, r2d + punpckhwd xm1, xm0, xm3 + punpcklwd xm0, xm3 + paddd xm0, xm1 + movd xm1, r6d + psrld xm0, 2 + pmulhuw xm0, xm1 + psrlw xm0, 1 +.w4_end: + vpbroadcastw xm0, xm0 +.s4: + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm0 + movq [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s4 + RET +ALIGN function_align +.h8: + mova xm0, [tlq-16] + jmp wq +.w8: + vextracti128 xm1, m0, 1 + paddw xm0, [tlq+2] + paddw xm0, xm4 + paddw xm0, xm1 + psrld xm1, xm0, 16 + paddw xm0, xm1 + pblendw xm0, xm3, 0xAA + psrlq xm1, xm0, 32 + paddd xm0, xm1 + psrldq xm1, xm0, 8 + paddd xm0, xm1 + psrld xm0, xm5 + cmp hd, 8 + je .w8_end + mov r6d, 0xAAAB + mov r2d, 0x6667 + cmp hd, 32 + cmovz r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 + psrlw xm0, 1 +.w8_end: + vpbroadcastw xm0, xm0 +.s8: + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm0 + mova [dstq+strideq*2], xm0 + mova [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s8 + RET +ALIGN function_align +.h16: + mova m0, [tlq-32] + jmp wq +.w16: + paddw m0, [tlq+2] + vextracti128 xm1, m0, 1 + paddw xm0, xm4 + paddw xm0, xm1 + punpckhwd xm1, xm0, xm3 + punpcklwd xm0, xm3 + paddd xm0, xm1 + psrlq xm1, xm0, 32 + paddd xm0, xm1 + psrldq xm1, xm0, 8 + paddd xm0, xm1 + psrld xm0, xm5 + cmp hd, 16 + je .w16_end + mov r6d, 0xAAAB + mov r2d, 0x6667 + test hb, 8|32 + cmovz r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 + psrlw xm0, 1 +.w16_end: + vpbroadcastw m0, xm0 +.s16: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s16 + RET +ALIGN function_align +.h32: + mova m0, [tlq-64] + paddw m0, [tlq-32] + jmp wq +.w32: + paddw m0, [tlq+ 2] + paddw m0, [tlq+34] + vextracti128 xm1, m0, 1 + paddw xm0, xm4 + paddw xm0, xm1 + punpcklwd xm1, xm0, xm3 + punpckhwd xm0, xm3 + paddd xm0, xm1 + psrlq xm1, xm0, 32 + paddd xm0, xm1 + psrldq xm1, xm0, 8 + paddd xm0, xm1 + psrld xm0, xm5 + cmp hd, 32 + je .w32_end + lea r2d, [hq*2] + mov r6d, 0x6667AAAB + shrx r6d, r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 + psrlw xm0, 1 +.w32_end: + vpbroadcastw m0, xm0 + mova m1, m0 +.s32: + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*1+32*0], m0 + mova [dstq+strideq*1+32*1], m1 + mova [dstq+strideq*2+32*0], m0 + mova [dstq+strideq*2+32*1], m1 + mova [dstq+stride3q +32*0], m0 + mova [dstq+stride3q +32*1], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s32 + RET +ALIGN function_align +.h64: + mova m0, [tlq-128] + mova m1, [tlq- 96] + paddw m0, [tlq- 64] + paddw m1, [tlq- 32] + paddw m0, m1 + jmp wq +.w64: + movu m1, [tlq+ 2] + paddw m0, [tlq+34] + paddw m1, [tlq+66] + paddw m0, [tlq+98] + paddw m0, m1 + vextracti128 xm1, m0, 1 + paddw xm0, xm1 + punpcklwd xm1, xm0, xm3 + punpckhwd xm0, xm3 + paddd xm1, xm4 + paddd xm0, xm1 + psrlq xm1, xm0, 32 + paddd xm0, xm1 + psrldq xm1, xm0, 8 + paddd xm0, xm1 + psrld xm0, xm5 + cmp hd, 64 + je .w64_end + mov r6d, 0x6667AAAB + shrx r6d, r6d, hd + movd xm1, r6d + pmulhuw xm0, xm1 + psrlw xm0, 1 +.w64_end: + vpbroadcastw m0, xm0 + mova m1, m0 + mova m2, m0 + mova m3, m0 +.s64: + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*0+32*2], m2 + mova [dstq+strideq*0+32*3], m3 + mova 
[dstq+strideq*1+32*0], m0 + mova [dstq+strideq*1+32*1], m1 + mova [dstq+strideq*1+32*2], m2 + mova [dstq+strideq*1+32*3], m3 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .s64 + RET + +cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 + mov r6d, r8m + shr r6d, 11 + lea r5, [ipred_dc_splat_16bpc_avx2_table] + tzcnt wd, wd + movifnidn hd, hm + movsxd wq, [r5+wq*4] + vpbroadcastd m0, [r5-ipred_dc_splat_16bpc_avx2_table+pw_512+r6*4] + mova m1, m0 + mova m2, m0 + mova m3, m0 + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +cglobal ipred_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 + movifnidn hd, hm + movu m0, [tlq+ 2] + movu m1, [tlq+34] + movu m2, [tlq+66] + movu m3, [tlq+98] + lea r5, [ipred_dc_splat_16bpc_avx2_table] + tzcnt wd, wd + movsxd wq, [r5+wq*4] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +%macro IPRED_H 2 ; w, store_type + vpbroadcastw m0, [tlq-2] + vpbroadcastw m1, [tlq-4] + vpbroadcastw m2, [tlq-6] + vpbroadcastw m3, [tlq-8] + sub tlq, 8 + mov%2 [dstq+strideq*0], m0 + mov%2 [dstq+strideq*1], m1 + mov%2 [dstq+strideq*2], m2 + mov%2 [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w%1 + RET +ALIGN function_align +%endmacro + +cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 + movifnidn hd, hm + lea r5, [ipred_h_16bpc_avx2_table] + tzcnt wd, wd + movsxd wq, [r5+wq*4] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq +INIT_XMM avx2 +.w4: + IPRED_H 4, q +.w8: + IPRED_H 8, a +INIT_YMM avx2 +.w16: + IPRED_H 16, a +.w32: + vpbroadcastw m0, [tlq-2] + vpbroadcastw m1, [tlq-4] + vpbroadcastw m2, [tlq-6] + vpbroadcastw m3, [tlq-8] + sub tlq, 8 + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m0 + mova [dstq+strideq*1+32*0], m1 + mova [dstq+strideq*1+32*1], m1 + mova [dstq+strideq*2+32*0], m2 + mova [dstq+strideq*2+32*1], m2 + mova [dstq+stride3q +32*0], m3 + mova [dstq+stride3q +32*1], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w32 + RET +.w64: + vpbroadcastw m0, [tlq-2] + vpbroadcastw m1, [tlq-4] + sub tlq, 4 + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m0 + mova [dstq+strideq*0+32*2], m0 + mova [dstq+strideq*0+32*3], m0 + mova [dstq+strideq*1+32*0], m1 + mova [dstq+strideq*1+32*1], m1 + mova [dstq+strideq*1+32*2], m1 + mova [dstq+strideq*1+32*3], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w64 + RET + +%macro PAETH 3 ; top, signed_ldiff, ldiff + paddw m0, m%2, m1 + psubw m7, m3, m0 ; tldiff + psubw m0, m%1 ; tdiff + pabsw m7, m7 + pabsw m0, m0 + pminsw m7, m0 + pcmpeqw m0, m7 + pcmpgtw m7, m%3, m7 + vpblendvb m0, m3, m%1, m0 + vpblendvb m0, m1, m0, m7 +%endmacro + +cglobal ipred_paeth_16bpc, 3, 6, 8, dst, stride, tl, w, h +%define base r5-ipred_paeth_16bpc_avx2_table + movifnidn hd, hm + lea r5, [ipred_paeth_16bpc_avx2_table] + tzcnt wd, wd + movsxd wq, [r5+wq*4] + vpbroadcastw m3, [tlq] ; topleft + add wq, r5 + jmp wq +.w4: + vpbroadcastq m2, [tlq+2] ; top + movsldup m6, [base+ipred_hv_shuf] + lea r3, [strideq*3] + psubw m4, m2, m3 + pabsw m5, m4 +.w4_loop: + sub tlq, 8 + vpbroadcastq m1, [tlq] + pshufb m1, m6 ; left + PAETH 2, 4, 5 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+r3 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_loop + RET +ALIGN function_align +.w8: + vbroadcasti128 m2, [tlq+2] + movsldup m6, [base+ipred_hv_shuf] + psubw m4, m2, m3 + pabsw m5, m4 +.w8_loop: + sub tlq, 4 + vpbroadcastd m1, [tlq] + pshufb m1, m6 + PAETH 2, 4, 5 + mova [dstq+strideq*0], xm0 + vextracti128 
[dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET +ALIGN function_align +.w16: + movu m2, [tlq+2] + psubw m4, m2, m3 + pabsw m5, m4 +.w16_loop: + sub tlq, 2 + vpbroadcastw m1, [tlq] + PAETH 2, 4, 5 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w16_loop + RET +ALIGN function_align +.w32: + movu m2, [tlq+2] + movu m6, [tlq+34] +%if WIN64 + movaps r4m, xmm8 + movaps r6m, xmm9 +%endif + psubw m4, m2, m3 + psubw m8, m6, m3 + pabsw m5, m4 + pabsw m9, m8 +.w32_loop: + sub tlq, 2 + vpbroadcastw m1, [tlq] + PAETH 2, 4, 5 + mova [dstq+32*0], m0 + PAETH 6, 8, 9 + mova [dstq+32*1], m0 + add dstq, strideq + dec hd + jg .w32_loop +%if WIN64 + movaps xmm8, r4m + movaps xmm9, r6m +%endif + RET +ALIGN function_align +.w64: + WIN64_SPILL_XMM 16 + movu m2, [tlq+ 2] + movu m6, [tlq+34] + movu m10, [tlq+66] + movu m13, [tlq+98] + psubw m4, m2, m3 + psubw m8, m6, m3 + psubw m11, m10, m3 + psubw m14, m13, m3 + pabsw m5, m4 + pabsw m9, m8 + pabsw m12, m11 + pabsw m15, m14 +.w64_loop: + sub tlq, 2 + vpbroadcastw m1, [tlq] + PAETH 2, 4, 5 + mova [dstq+32*0], m0 + PAETH 6, 8, 9 + mova [dstq+32*1], m0 + PAETH 10, 11, 12 + mova [dstq+32*2], m0 + PAETH 13, 14, 15 + mova [dstq+32*3], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET + +cglobal ipred_smooth_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, weights +%define base r6-ipred_smooth_v_16bpc_avx2_table + lea r6, [ipred_smooth_v_16bpc_avx2_table] + tzcnt wd, wm + mov hd, hm + movsxd wq, [r6+wq*4] + lea weightsq, [base+smooth_weights_1d_16bpc+hq*4] + neg hq + vpbroadcastw m5, [tlq+hq*2] ; bottom + add wq, r6 + jmp wq +.w4: + vpbroadcastq m4, [tlq+2] ; top + movsldup m3, [base+ipred_hv_shuf] + lea r6, [strideq*3] + psubw m4, m5 ; top - bottom +.w4_loop: + vpbroadcastq m0, [weightsq+hq*2] + pshufb m0, m3 + pmulhrsw m0, m4 + paddw m0, m5 + vextracti128 xm1, m0, 1 + movhps [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movq [dstq+r6 ], xm0 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w4_loop +.ret: + RET +.w8: + vbroadcasti128 m4, [tlq+2] + movsldup m3, [base+ipred_hv_shuf] + lea r6, [strideq*3] + psubw m4, m5 +.w8_loop: + vpbroadcastd m0, [weightsq+hq*2+0] + vpbroadcastd m1, [weightsq+hq*2+4] + pshufb m0, m3 + pshufb m1, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + paddw m0, m5 + paddw m1, m5 + vextracti128 [dstq+strideq*0], m0, 1 + mova [dstq+strideq*1], xm0 + vextracti128 [dstq+strideq*2], m1, 1 + mova [dstq+r6 ], xm1 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w8_loop + RET +.w16: + movu m4, [tlq+2] + lea r6, [strideq*3] + psubw m4, m5 +.w16_loop: + vpbroadcastw m0, [weightsq+hq*2+0] + vpbroadcastw m1, [weightsq+hq*2+2] + vpbroadcastw m2, [weightsq+hq*2+4] + vpbroadcastw m3, [weightsq+hq*2+6] + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + REPX {paddw x, m5}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r6 ], m3 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w16_loop + RET +.w32: + WIN64_SPILL_XMM 7 + movu m4, [tlq+ 2] + movu m6, [tlq+34] + psubw m4, m5 + psubw m6, m5 +.w32_loop: + vpbroadcastw m1, [weightsq+hq*2+0] + vpbroadcastw m3, [weightsq+hq*2+2] + pmulhrsw m0, m4, m1 + pmulhrsw m1, m6 + pmulhrsw m2, m4, m3 + pmulhrsw m3, m6 + REPX {paddw x, m5}, m0, m1, m2, m3 + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*1+32*0], m2 + mova [dstq+strideq*1+32*1], m3 + lea dstq, [dstq+strideq*2] + add hq, 2 + jl .w32_loop + RET +.w64: + WIN64_SPILL_XMM 8 + movu m3, [tlq+ 2] + movu m4, [tlq+34] + movu 
m6, [tlq+66] + movu m7, [tlq+98] + REPX {psubw x, m5}, m3, m4, m6, m7 +.w64_loop: + vpbroadcastw m2, [weightsq+hq*2] + pmulhrsw m0, m3, m2 + pmulhrsw m1, m4, m2 + paddw m0, m5 + paddw m1, m5 + mova [dstq+32*0], m0 + pmulhrsw m0, m6, m2 + mova [dstq+32*1], m1 + pmulhrsw m1, m7, m2 + paddw m0, m5 + paddw m1, m5 + mova [dstq+32*2], m0 + mova [dstq+32*3], m1 + add dstq, strideq + inc hq + jl .w64_loop + RET + +cglobal ipred_smooth_h_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 +%define base r6-ipred_smooth_h_16bpc_avx2_table + lea r6, [ipred_smooth_h_16bpc_avx2_table] + mov wd, wm + movifnidn hd, hm + vpbroadcastw m5, [tlq+wq*2] ; right + tzcnt wd, wd + add hd, hd + movsxd wq, [r6+wq*4] + sub tlq, hq + lea stride3q, [strideq*3] + add wq, r6 + jmp wq +.w4: + vpbroadcastq m4, [base+smooth_weights_1d_16bpc+4*2] + movsldup m3, [base+ipred_hv_shuf] +.w4_loop: + vpbroadcastq m0, [tlq+hq-8] ; left + pshufb m0, m3 + psubw m0, m5 ; left - right + pmulhrsw m0, m4 + paddw m0, m5 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4*2 + jg .w4_loop + RET +.w8: + vbroadcasti128 m4, [base+smooth_weights_1d_16bpc+8*2] + movsldup m3, [base+ipred_hv_shuf] +.w8_loop: + vpbroadcastd m0, [tlq+hq-4] + vpbroadcastd m1, [tlq+hq-8] + pshufb m0, m3 + pshufb m1, m3 + psubw m0, m5 + psubw m1, m5 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + paddw m0, m5 + paddw m1, m5 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+stride3q ], m1, 1 + lea dstq, [dstq+strideq*4] + sub hq, 4*2 + jg .w8_loop + RET +.w16: + movu m4, [base+smooth_weights_1d_16bpc+16*2] +.w16_loop: + vpbroadcastq m3, [tlq+hq-8] + punpcklwd m3, m3 + psubw m3, m5 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + REPX {paddw x, m5}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + sub hq, 4*2 + jg .w16_loop + RET +.w32: + WIN64_SPILL_XMM 7 + movu m4, [base+smooth_weights_1d_16bpc+32*2] + movu m6, [base+smooth_weights_1d_16bpc+32*3] +.w32_loop: + vpbroadcastw m1, [tlq+hq-2] + vpbroadcastw m3, [tlq+hq-4] + psubw m1, m5 + psubw m3, m5 + pmulhrsw m0, m4, m1 + pmulhrsw m1, m6 + pmulhrsw m2, m4, m3 + pmulhrsw m3, m6 + REPX {paddw x, m5}, m0, m1, m2, m3 + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*1+32*0], m2 + mova [dstq+strideq*1+32*1], m3 + lea dstq, [dstq+strideq*2] + sub hq, 2*2 + jg .w32_loop + RET +.w64: + WIN64_SPILL_XMM 8 + movu m3, [base+smooth_weights_1d_16bpc+32*4] + movu m4, [base+smooth_weights_1d_16bpc+32*5] + movu m6, [base+smooth_weights_1d_16bpc+32*6] + movu m7, [base+smooth_weights_1d_16bpc+32*7] +.w64_loop: + vpbroadcastw m2, [tlq+hq-2] + psubw m2, m5 + pmulhrsw m0, m3, m2 + pmulhrsw m1, m4, m2 + paddw m0, m5 + paddw m1, m5 + mova [dstq+32*0], m0 + pmulhrsw m0, m6, m2 + mova [dstq+32*1], m1 + pmulhrsw m1, m7, m2 + paddw m0, m5 + paddw m1, m5 + mova [dstq+32*2], m0 + mova [dstq+32*3], m1 + add dstq, strideq + sub hq, 1*2 + jg .w64_loop + RET + +%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2] + pmaddwd m0, m%1, m%3 + pmaddwd m1, m%2, m%4 + paddd m0, m%5 + paddd m1, m%6 + psrld m0, 8 + psrld m1, 8 + packssdw m0, m1 + pavgw m0, m5 +%endmacro + +cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights +%define 
base r6-ipred_smooth_16bpc_avx2_table + lea r6, [ipred_smooth_16bpc_avx2_table] + mov wd, wm + vpbroadcastw m4, [tlq+wq*2] ; right + tzcnt wd, wd + mov hd, hm + sub tlq, hq + sub tlq, hq + movsxd wq, [r6+wq*4] + pxor m5, m5 + add wq, r6 + lea v_weightsq, [base+smooth_weights_2d_16bpc+hq*4] + jmp wq +.w4: + WIN64_SPILL_XMM 11 + vpbroadcastw m0, [tlq] ; bottom + vpbroadcastq m6, [tlq+hq*2+2] + movsldup m7, [base+ipred_hv_shuf] + movshdup m9, [base+ipred_hv_shuf] + vbroadcasti128 m10, [base+smooth_weights_2d_16bpc+4*4] + punpcklwd m6, m0 ; top, bottom + punpcklqdq m8, m9, m9 + punpckhqdq m9, m9 + lea r3, [strideq*3] +.w4_loop: + vpbroadcastq m3, [tlq+hq*2-8] + vbroadcasti128 m1, [v_weightsq] + pshufb m3, m7 + punpcklwd m2, m3, m4 ; left, right + punpckhwd m3, m4 + pmaddwd m2, m10 + pmaddwd m3, m10 + pshufb m0, m1, m8 + pshufb m1, m9 + SMOOTH_2D_END 0, 1, 6, 6, 2, 3 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+r3 ], xm1 + lea dstq, [dstq+strideq*4] + add v_weightsq, 16 + sub hd, 4 + jg .w4_loop + RET +.w8: +%assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 12 + vpbroadcastw m0, [tlq] ; bottom + vbroadcasti128 m7, [tlq+hq*2+2] + movsldup m8, [base+ipred_hv_shuf] + movshdup m9, [base+ipred_hv_shuf] + vbroadcasti128 m10, [base+smooth_weights_2d_16bpc+8*4+16*0] + vbroadcasti128 m11, [base+smooth_weights_2d_16bpc+8*4+16*1] + punpcklwd m6, m7, m0 ; top, bottom + punpckhwd m7, m0 +.w8_loop: + vpbroadcastd m3, [tlq+hq*2-4] + vpbroadcastq m1, [v_weightsq] + pshufb m3, m8 + punpcklwd m2, m3, m4 ; left, right + punpckhwd m3, m4 + pmaddwd m2, m10 + pmaddwd m3, m11 + pshufb m1, m9 + SMOOTH_2D_END 1, 1, 6, 7, 2, 3 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + add v_weightsq, 8 + sub hd, 2 + jg .w8_loop + RET +.w16: +%assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 11 + vpbroadcastw m0, [tlq] ; bottom + movu m7, [tlq+hq*2+2] + mova xm8, [base+smooth_weights_2d_16bpc+16*4+16*0] + mova xm9, [base+smooth_weights_2d_16bpc+16*4+16*1] + vinserti128 m8, [base+smooth_weights_2d_16bpc+16*4+16*2], 1 + vinserti128 m9, [base+smooth_weights_2d_16bpc+16*4+16*3], 1 + punpcklwd m6, m7, m0 ; top, bottom + punpckhwd m7, m0 +.w16_loop: + vpbroadcastd m3, [tlq+hq*2-4] + vpbroadcastd m1, [v_weightsq+0] + punpcklwd m3, m4 ; left, right + pshufd m2, m3, q1111 + pmaddwd m10, m8, m2 + pmaddwd m2, m9 + pshufd m3, m3, q0000 + SMOOTH_2D_END 1, 1, 6, 7, 10, 2 + vpbroadcastd m1, [v_weightsq+4] + pmaddwd m2, m8, m3 + pmaddwd m3, m9 + mova [dstq+strideq*0], m0 + SMOOTH_2D_END 1, 1, 6, 7, 2, 3 + mova [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + add v_weightsq, 8 + sub hq, 2 + jg .w16_loop + RET +.w32: +%assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 15 + vpbroadcastw m0, [tlq] ; bottom + movu m7, [tlq+hq*2+ 2] + movu m9, [tlq+hq*2+34] + mova xm10, [base+smooth_weights_2d_16bpc+32*4+16*0] + mova xm11, [base+smooth_weights_2d_16bpc+32*4+16*1] + vinserti128 m10, [base+smooth_weights_2d_16bpc+32*4+16*2], 1 + vinserti128 m11, [base+smooth_weights_2d_16bpc+32*4+16*3], 1 + mova xm12, [base+smooth_weights_2d_16bpc+32*4+16*4] + mova xm13, [base+smooth_weights_2d_16bpc+32*4+16*5] + vinserti128 m12, [base+smooth_weights_2d_16bpc+32*4+16*6], 1 + vinserti128 m13, [base+smooth_weights_2d_16bpc+32*4+16*7], 1 + punpcklwd m6, m7, m0 + punpckhwd m7, m0 + punpcklwd m8, m9, m0 + punpckhwd m9, m0 +.w32_loop: + vpbroadcastw m3, [tlq+hq*2-2] + 
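+ ; each row below computes the 2-D smooth blend, in C terms (a sketch):
+ ;   pred = round2(w_v*top + (256-w_v)*bottom + w_h*left + (256-w_h)*right, 9)
+ ; the (top,bottom) and (left,right) word pairs are pmaddwd'ed against
+ ; (w, 256-w) pairs from smooth_weights_2d_16bpc, the dword sums shifted
+ ; right by 8, and pavgw against zero (m5) supplies the final +1 >> 1.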
vpbroadcastd m14, [v_weightsq] + punpcklwd m3, m4 + pmaddwd m1, m10, m3 + pmaddwd m2, m11, m3 + pmaddwd m0, m6, m14 + paddd m0, m1 + pmaddwd m1, m7, m14 + paddd m1, m2 + pmaddwd m2, m12, m3 + pmaddwd m3, m13 + psrld m0, 8 + psrld m1, 8 + packssdw m0, m1 + pavgw m0, m5 + mova [dstq+32*0], m0 + SMOOTH_2D_END 14, 14, 8, 9, 2, 3 + mova [dstq+32*1], m0 + add dstq, strideq + add v_weightsq, 4 + dec hd + jg .w32_loop + RET +.w64: +%assign stack_offset stack_offset - stack_size_padded + PROLOGUE 0, 11, 16, dst, stride, tl, tl_base, h, v_weights, dummy, v_weights_base, x, y, dst_base + mov dst_baseq, dstq + mov tl_baseq, tlq + mov v_weights_baseq, v_weightsq + xor xq, xq +.w64_loop_x: + mov yq, hq + lea tlq, [tl_baseq+hq*2] + vpbroadcastw m0, [tl_baseq] ; bottom + movu m7, [tlq+xq*2+ 2] + movu m9, [tlq+xq*2+34] + mova xm10, [base+smooth_weights_2d_16bpc+64*4+16*0] + mova xm11, [base+smooth_weights_2d_16bpc+64*4+16*1] + vinserti128 m10, [base+smooth_weights_2d_16bpc+64*4+16*2], 1 + vinserti128 m11, [base+smooth_weights_2d_16bpc+64*4+16*3], 1 + mova xm12, [base+smooth_weights_2d_16bpc+64*4+16*4] + mova xm13, [base+smooth_weights_2d_16bpc+64*4+16*5] + vinserti128 m12, [base+smooth_weights_2d_16bpc+64*4+16*6], 1 + vinserti128 m13, [base+smooth_weights_2d_16bpc+64*4+16*7], 1 + punpcklwd m6, m7, m0 + punpckhwd m7, m0 + punpcklwd m8, m9, m0 + punpckhwd m9, m0 + lea tlq, [tl_baseq-2] +.w64_loop_y: + vpbroadcastw m3, [tlq+yq*2] + vpbroadcastd m1, [v_weightsq] + punpcklwd m3, m4 + pmaddwd m14, m10, m3 + pmaddwd m15, m11, m3 + pmaddwd m2, m12, m3 + pmaddwd m3, m13 + pmaddwd m0, m6, m1 + paddd m0, m14 + pmaddwd m14, m7, m1 + paddd m14, m15 + psrld m0, 8 + psrld m14, 8 + packssdw m0, m14 + pavgw m0, m5 + mova [dstq+32*0], m0 + SMOOTH_2D_END 8, 9, 1, 1, 2, 3 + mova [dstq+32*1], m0 + add dstq, strideq + add v_weightsq, 4 + dec yq + jg .w64_loop_y + lea dstq, [dst_baseq+32*2] + add r6, 16*8 + mov v_weightsq, v_weights_baseq + add xq, 32 + test xb, 64 + jz .w64_loop_x + RET + +cglobal ipred_z1_16bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase + %assign org_stack_offset stack_offset + lea r6, [ipred_z1_16bpc_avx2_table] + tzcnt wd, wm + movifnidn angled, anglem + movifnidn hd, hm + lea r7, [dr_intra_derivative] + movsxd wq, [r6+wq*4] + add tlq, 2 + add wq, r6 + mov dxd, angled + and dxd, 0x7e + add angled, 165 ; ~90 + movzx dxd, word [r7+dxq] + xor angled, 0x4ff ; d = 90 - angle + vpbroadcastd m5, [pw_62] + jmp wq +.w4: + ALLOC_STACK -64, 7 + cmp angleb, 40 + jae .w4_no_upsample + lea r3d, [angleq-1024] + sar r3d, 7 + add r3d, hd + jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm) + vpbroadcastw xm3, [tlq+14] + movu xm1, [tlq+ 0] ; 1 2 3 4 5 6 7 8 + palignr xm0, xm3, xm1, 4 ; 3 4 5 6 7 8 8 8 + paddw xm0, [tlq- 2] ; 0 1 2 3 4 5 6 7 + add dxd, dxd + palignr xm2, xm3, xm1, 2 ; 2 3 4 5 6 7 8 8 + paddw xm2, xm1 ; -1 * a + 9 * b + 9 * c + -1 * d + psubw xm0, xm2, xm0 ; = (b + c - a - d + (b + c) << 3 + 8) >> 4 + psraw xm0, 3 ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1 + pxor xm4, xm4 + paddw xm2, xm0 + vpbroadcastw xm0, r8m ; pixel_max + mova [rsp+32], xm3 + movd xm3, dxd + pmaxsw xm2, xm4 + mov r3d, dxd + pavgw xm2, xm4 + vpbroadcastw m3, xm3 + pminsw xm2, xm0 + punpcklwd xm0, xm1, xm2 + punpckhwd xm1, xm2 + lea r5, [strideq*3] + pslldq m2, m3, 8 + mova [rsp+ 0], xm0 + mova [rsp+16], xm1 + paddw m6, m3, m3 + paddw m3, m2 + vpblendd m4, m6, 0xf0 + paddw m6, m6 + paddw m3, m4 ; xpos0 xpos1 xpos2 xpos3 + vbroadcasti128 m4, [z_upsample] +.w4_upsample_loop: + lea r2d, [r3+dxq] + shr r3d, 6 ; 
base0 + movu xm1, [rsp+r3*2] + lea r3d, [r2+dxq] + shr r2d, 6 ; base1 + movu xm2, [rsp+r2*2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base2 + vinserti128 m1, [rsp+r3*2], 1 ; 0 2 + lea r3d, [r2+dxq] + shr r2d, 6 ; base3 + vinserti128 m2, [rsp+r2*2], 1 ; 1 3 + pshufb m1, m4 + pshufb m2, m4 + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 + pand m2, m5, m3 ; frac + psllw m2, 9 ; (a * (64 - frac) + b * frac + 32) >> 6 + psubw m1, m0 ; = a + (((b - a) * frac + 32) >> 6) + pmulhrsw m1, m2 ; = a + (((b - a) * (frac << 9) + 16384) >> 15) + paddw m3, m6 ; xpos += dx + paddw m0, m1 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r5 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_upsample_loop + RET +ALIGN function_align +.filter_strength: ; w4/w8/w16 +%define base r3-z_filter_t0 + movd xm0, maxbased + lea r3, [z_filter_t0] + movd xm1, angled + shr angled, 8 ; is_sm << 1 + vpbroadcastb m0, xm0 + vpbroadcastb m1, xm1 + pcmpeqb m0, [base+z_filter_wh] + mova xm2, [r3+angleq*8] + pand m0, m1 + pcmpgtb m0, m2 + pmovmskb r5d, m0 + ret +.w4_no_upsample: + mov maxbased, 7 + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w4_main + lea maxbased, [hq+3] + call .filter_strength + mov maxbased, 7 + test r5d, r5d + jz .w4_main ; filter_strength == 0 + popcnt r5d, r5d + vpbroadcastw xm3, [tlq+14] + mova xm0, [tlq- 2] ; 0 1 2 3 4 5 6 7 + vpbroadcastd xm1, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd xm4, [base+z_filter_k-4+r5*4+12*0] + palignr xm2, xm3, xm0, 4 ; 2 3 4 5 6 7 8 8 + pmullw xm1, [tlq+ 0] ; 1 2 3 4 5 6 7 8 + paddw xm2, xm0 + pmullw xm2, xm4 + movd [rsp+16], xm3 + cmp r5d, 3 + jne .w4_3tap + paddw xm1, xm2 + palignr xm2, xm3, xm0, 6 ; 3 4 5 6 7 8 8 8 + pblendw xm0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 + movzx r3d, word [tlq+14] + movzx r2d, word [tlq+12] + inc maxbased + paddw xm2, xm0 + sub r2d, r3d + paddw xm2, xm2 + lea r2d, [r2+r3*8+4] + shr r2d, 3 ; (1 * top[6] + 7 * top[7] + 4) >> 3 + mov [rsp+16], r2w +.w4_3tap: + pxor xm0, xm0 + paddw xm1, xm2 + mov tlq, rsp + psrlw xm1, 3 + cmp hd, 8 + sbb maxbased, -1 + pavgw xm0, xm1 + mova [tlq], xm0 +.w4_main: + movd xm3, dxd + vpbroadcastq m1, [z_base_inc] + vpbroadcastw m6, [tlq+maxbaseq*2] ; top[max_base_x] + shl maxbased, 6 + vpbroadcastw m3, xm3 + movd xm0, maxbased + mov r3d, dxd ; xpos + vpbroadcastw m0, xm0 + paddw m4, m3, m3 + psubw m1, m0 ; -max_base_x + vpblendd m3, m4, 0xcc + paddw m0, m4, m3 + vpblendd m3, m0, 0xf0 ; xpos0 xpos1 xpos2 xpos3 + paddw m4, m4 + paddw m3, m1 +.w4_loop: + lea r5d, [r3+dxq] + shr r3d, 6 ; base0 + movu xm1, [tlq+r3*2] + lea r3d, [r5+dxq] + shr r5d, 6 ; base1 + movu xm2, [tlq+r5*2] + lea r5d, [r3+dxq] + shr r3d, 6 ; base2 + vinserti128 m1, [tlq+r3*2], 1 ; 0 2 + lea r3d, [r5+dxq] + shr r5d, 6 ; base3 + vinserti128 m2, [tlq+r5*2], 1 ; 1 3 + punpcklqdq m0, m1, m2 + psrldq m1, 2 + pslldq m2, 6 + vpblendd m1, m2, 0xcc + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + psraw m2, m3, 15 ; xpos < max_base_x + paddw m3, m4 + paddw m0, m1 + vpblendvb m0, m6, m0, m2 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + lea dstq, [dstq+strideq*2] + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + sub hd, 4 + jz .w4_end + lea dstq, [dstq+strideq*2] + cmp r3d, maxbased + jb .w4_loop + lea r6, [strideq*3] +.w4_end_loop: + movq [dstq+strideq*0], xm6 + movq [dstq+strideq*1], xm6 + movq [dstq+strideq*2], xm6 + movq [dstq+r6 ], xm6 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_end_loop +.w4_end: 
+ RET +.w8: + %assign stack_offset org_stack_offset + ALLOC_STACK -64, 7 + lea r3d, [angleq+216] + mov r3b, hb + cmp r3d, 8 + ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 + movu m2, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g _ + movu m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g _ _ + movu m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + cmp hd, 4 + jne .w8_upsample_h8 ; awkward single-pixel edge case + vpblendd m0, m2, 0x20 ; 3 4 5 6 7 8 9 a b c c _ _ _ _ _ +.w8_upsample_h8: + paddw m2, m1 + paddw m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + add dxd, dxd + psubw m0, m2, m0 + psraw m0, 3 + pxor m4, m4 + paddw m2, m0 + vpbroadcastw m0, r8m + movd xm3, dxd + pmaxsw m2, m4 + mov r3d, dxd + pavgw m2, m4 + vpbroadcastw m3, xm3 + pminsw m2, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + vbroadcasti128 m4, [z_upsample] + mova [rsp+ 0], xm0 + mova [rsp+16], xm1 + paddw m6, m3, m3 + vextracti128 [rsp+32], m0, 1 + vextracti128 [rsp+48], m1, 1 + vpblendd m3, m6, 0xf0 ; xpos0 xpos1 +.w8_upsample_loop: + lea r2d, [r3+dxq] + shr r3d, 6 ; base0 + movu xm1, [rsp+r3*2] + movu xm2, [rsp+r3*2+16] + lea r3d, [r2+dxq] + shr r2d, 6 ; base1 + vinserti128 m1, [rsp+r2*2], 1 + vinserti128 m2, [rsp+r2*2+16], 1 + pshufb m1, m4 + pshufb m2, m4 + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m3, m6 + paddw m0, m1 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_upsample_loop + RET +.w8_no_intra_edge_filter: + and maxbased, 7 + or maxbased, 8 ; imin(h+7, 15) + jmp .w8_main +.w8_no_upsample: + lea maxbased, [hq+7] + test angled, 0x400 + jnz .w8_no_intra_edge_filter + call .filter_strength + test r5d, r5d + jz .w8_main + popcnt r5d, r5d + vpbroadcastd m1, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0] + mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + movu m2, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + pmullw m1, m2 + cmp hd, 8 + jl .w8_filter_h4 + punpckhwd m2, m2 + vpblendd m3, m2, [tlq+2], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g + je .w8_filter_end ; 8x4 and 8x8 are always 3-tap + movzx r3d, word [tlq+30] + mov maxbased, 16 + mov [rsp+32], r3d + cmp r5d, 3 + jne .w8_filter_end + punpcklwd xm6, xm0, xm0 + vpblendd m2, [tlq+4], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g g g + vpblendd m6, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + movzx r5d, word [tlq+28] + mov [rsp+34], r3w + paddw m2, m6 + sub r5d, r3d + inc maxbased + paddw m2, m2 + lea r3d, [r5+r3*8+4] + paddw m1, m2 + shr r3d, 3 + mov [rsp+32], r3w + jmp .w8_filter_end +.w8_filter_h4: + pshuflw m3, m2, q3321 + vinserti128 m3, [tlq+2], 0 ; 2 3 4 5 6 7 8 9 a b c c _ _ _ _ +.w8_filter_end: + paddw m0, m3 + pmullw m0, m4 + mov tlq, rsp + pxor m2, m2 + paddw m0, m1 + psrlw m0, 3 + pavgw m0, m2 + mova [tlq], m0 +.w8_main: + movd xm3, dxd + vbroadcasti128 m1, [z_base_inc] + vpbroadcastw m6, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m3, xm3 + movd xm0, maxbased + mov r3d, dxd + vpbroadcastw m0, xm0 + paddw m4, m3, m3 + psubw m1, m0 + vpblendd m3, m4, 0xf0 ; xpos0 xpos1 + paddw m3, m1 +.w8_loop: + lea r5d, [r3+dxq] + shr r3d, 6 + movu xm0, [tlq+r3*2] + movu xm1, [tlq+r3*2+2] + lea r3d, [r5+dxq] + shr r5d, 6 + vinserti128 m0, [tlq+r5*2], 1 + vinserti128 m1, [tlq+r5*2+2], 1 + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + psraw m2, m3, 15 + paddw m3, m4 + paddw m0, m1 + vpblendvb m0, m6, m0, m2 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], 
m0, 1 + sub hd, 2 + jz .w8_end + lea dstq, [dstq+strideq*2] + cmp r3d, maxbased + jb .w8_loop +.w8_end_loop: + mova [dstq+strideq*0], xm6 + mova [dstq+strideq*1], xm6 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_end_loop +.w8_end: + RET +.w16_no_intra_edge_filter: + and maxbased, 15 + or maxbased, 16 ; imin(h+15, 31) + jmp .w16_main +.w16: + %assign stack_offset org_stack_offset + ALLOC_STACK -96, 7 + lea maxbased, [hq+15] + test angled, 0x400 + jnz .w16_no_intra_edge_filter + call .filter_strength + test r5d, r5d + jz .w16_main + popcnt r5d, r5d + mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw m1, m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h + cmp r5d, 3 + jne .w16_filter_3tap + vpbroadcastd m2, [base+pw_3] + punpcklwd xm0, xm0 + vpblendd m0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + paddw m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + paddw m0, m2 + pavgw m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i + paddw m0, m1 + psrlw m0, 2 + movu m3, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw m1, m3, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + cmp hd, 8 + jl .w16_filter_5tap_h4 + punpckhwd m3, m3 + je .w16_filter_5tap_h8 + vpblendd m4, m3, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h + vpblendd m3, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h + movzx r3d, word [tlq+62] + movzx r2d, word [tlq+60] + pavgw m2, m4 + sub r2d, r3d + paddw m1, m3 + lea r2d, [r2+r3*8+4] + paddw m1, m2 + shr r2d, 3 + psrlw m1, 2 + mov [rsp+66], r3w + mov [rsp+64], r2w + mov tlq, rsp + mov r3d, 33 + cmp hd, 16 + cmovg maxbased, r3d + jmp .w16_filter_end2 +.w16_filter_5tap_h8: + vpblendd xm4, xm3, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9 + vpblendd xm3, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9 + pavgw xm2, xm4 + paddw xm1, xm3 + paddw xm1, xm2 + psrlw xm1, 2 + jmp .w16_filter_end2 +.w16_filter_5tap_h4: + pshuflw xm4, xm3, q3332 ; 4 5 5 5 + pshuflw xm3, xm3, q3321 ; 3 4 5 5 + pavgw xm2, xm4 + paddw xm1, xm3 + paddw xm1, xm2 + psrlw xm1, 2 + jmp .w16_filter_end2 +.w16_filter_3tap: + vpbroadcastd m3, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0] + pmullw m0, m3, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + movu m2, [tlq+32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + pmullw m1, m4 + pmullw m3, m2 + paddw m0, m1 + cmp hd, 8 + je .w16_filter_3tap_h8 + jl .w16_filter_3tap_h4 + punpckhwd m2, m2 + vpblendd m2, [tlq+34], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g + jmp .w16_filter_end +.w16_filter_3tap_h4: + pshuflw xm2, xm2, q3321 ; 2 3 4 4 _ _ _ _ + jmp .w16_filter_end +.w16_filter_3tap_h8: + psrldq xm2, 2 + pshufhw xm2, xm2, q2210 ; 2 3 4 5 6 7 8 8 +.w16_filter_end: + paddw m2, [tlq+30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + pmullw m2, m4 + psrlw m0, 3 + pxor m1, m1 + paddw m2, m3 + psrlw m2, 3 + pavgw m0, m1 + pavgw m1, m2 +.w16_filter_end2: + mov tlq, rsp + mova [tlq+ 0], m0 + mova [tlq+32], m1 +.w16_main: + movd xm4, dxd + vpbroadcastw m6, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m4, xm4 + movd xm0, maxbased + mov r3d, dxd + vpbroadcastw m0, xm0 + paddw m3, m4, [z_base_inc] + psubw m3, m0 +.w16_loop: + lea r5d, [r3+dxq] + shr r3d, 6 + movu m0, [tlq+r3*2] + movu m1, [tlq+r3*2+2] + lea r3d, [r5+dxq] + shr r5d, 6 + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + psraw m2, m3, 15 + paddw m3, m4 + paddw m1, m0 + movu m0, [tlq+r5*2] + vpblendvb m2, m6, m1, m2 + movu m1, [tlq+r5*2+2] + mova [dstq+strideq*0], m2 + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + psraw m2, m3, 15 + paddw m3, m4 + 
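+ ; m3 tracks xpos - max_base_x per lane, so the psraw by 15 above yields
+ ; an all-ones mask for lanes still inside the edge buffer; vpblendvb then
+ ; selects, roughly: out = xpos < max_base_x ? interp : top[max_base_x] (m6)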
paddw m0, m1 + vpblendvb m0, m6, m0, m2 + mova [dstq+strideq*1], m0 + sub hd, 2 + jz .w16_end + lea dstq, [dstq+strideq*2] + cmp r3d, maxbased + jb .w16_loop +.w16_end_loop: + mova [dstq+strideq*0], m6 + mova [dstq+strideq*1], m6 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_end_loop +.w16_end: + RET +.w32: + %assign stack_offset org_stack_offset + ALLOC_STACK -160, 8 + lea maxbased, [hq+31] + mov r3d, 63 + cmp hd, 32 + cmova maxbased, r3d + test angled, 0x400 + jnz .w32_main + vpbroadcastd m2, [pw_3] + mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + punpcklwd xm1, xm0, xm0 + vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + paddw m1, m2 + paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h + pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i + mov r3, rsp + paddw m0, m1 + lea r5d, [maxbaseq-31] + psrlw m0, 2 + mova [r3], m0 +.w32_filter_loop: + mova m0, [tlq+30] + paddw m1, m2, [tlq+28] + add tlq, 32 + paddw m0, [tlq+0] + pavgw m1, [tlq+4] + paddw m0, [tlq+2] + add r3, 32 + paddw m0, m1 + psrlw m0, 2 + mova [r3], m0 + sub r5d, 16 + jg .w32_filter_loop + movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h + punpckhwd m1, m0, m0 + paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + jl .w32_filter_h8 + vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h + vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h + movzx r5d, word [tlq+62] + movzx r2d, word [tlq+60] + pavgw m2, m3 + sub r2d, r5d + paddw m0, m1 + lea r2d, [r2+r5*8+4] + paddw m0, m2 + shr r2d, 3 + psrlw m0, 2 + mova [r3+32], m0 + mov [r3+66], r5w + mov [r3+64], r2w + mov tlq, rsp + mov r3d, 65 + cmp hd, 64 + cmove maxbased, r3d + jmp .w32_main +.w32_filter_h8: + vpblendd xm3, xm1, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9 + vpblendd xm1, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9 + pavgw xm2, xm3 + paddw xm0, xm1 + mov tlq, rsp + paddw xm0, xm2 + psrlw xm0, 2 + mova [r3+32], xm0 +.w32_main: + movd xm4, dxd + vpbroadcastw m6, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m4, xm4 + movd xm0, maxbased + mov r5d, dxd + vpbroadcastd m7, [pw_m1024] ; -16 * 64 + vpbroadcastw m0, xm0 + paddw m3, m4, [z_base_inc] + psubw m3, m0 +.w32_loop: + mov r3d, r5d + shr r3d, 6 + movu m0, [tlq+r3*2] + movu m1, [tlq+r3*2+2] + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + psraw m1, m3, 15 + vpblendvb m0, m6, m0, m1 + mova [dstq+32*0], m0 + movu m0, [tlq+r3*2+32] + movu m1, [tlq+r3*2+34] + add r5d, dxd + psubw m1, m0 + pmulhrsw m1, m2 + pcmpgtw m2, m7, m3 + paddw m3, m4 + paddw m0, m1 + vpblendvb m0, m6, m0, m2 + mova [dstq+32*1], m0 + dec hd + jz .w32_end + add dstq, strideq + cmp r5d, maxbased + jb .w32_loop +.w32_end_loop: + mova [dstq+32*0], m6 + mova [dstq+32*1], m6 + add dstq, strideq + dec hd + jg .w32_end_loop +.w32_end: + RET +.w64: + %assign stack_offset org_stack_offset + ALLOC_STACK -256, 10 + lea maxbased, [hq+63] + test angled, 0x400 + jnz .w64_main + vpbroadcastd m2, [pw_3] + mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + punpcklwd xm1, xm0, xm0 + vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + paddw m1, m2 + paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h + pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i + mov r3, rsp + paddw m0, m1 + lea r5d, [hq+32] + psrlw m0, 2 + mova [r3], m0 +.w64_filter_loop: + mova m0, [tlq+30] + paddw m1, m2, [tlq+28] + add tlq, 32 + paddw m0, [tlq+0] + 
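+ ; strength-3 edge smoothing with the 5-tap kernel {2,4,4,4,2}/16; the pw_3
+ ; bias in m1 plus pavgw's implicit +1 provide the rounding, so per sample
+ ; this loop computes (a sketch):
+ ;   out[i] = (2*p[i-2] + 4*p[i-1] + 4*p[i] + 4*p[i+1] + 2*p[i+2] + 8) >> 4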
pavgw m1, [tlq+4] + paddw m0, [tlq+2] + add r3, 32 + paddw m0, m1 + psrlw m0, 2 + mova [r3], m0 + sub r5d, 16 + jg .w64_filter_loop + movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h + punpckhwd m1, m0, m0 + paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h + vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h + pavgw m2, m3 + paddw m0, m1 + paddw m0, m2 + mov tlq, rsp + psrlw m0, 2 + mova [r3+32], m0 +.w64_main: + movd xm4, dxd + vpbroadcastw m6, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m4, xm4 + movd xm0, maxbased + mov r5d, dxd + vpbroadcastd m7, [pw_m1024] ; -16 * 64 + vpbroadcastw m0, xm0 + paddw m3, m4, [z_base_inc] + paddw m8, m7, m7 ; -32 * 64 + psubw m3, m0 + paddw m9, m8, m7 ; -48 * 64 +.w64_loop: + mov r3d, r5d + shr r3d, 6 + movu m0, [tlq+r3*2] + movu m1, [tlq+r3*2+2] + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + psraw m1, m3, 15 + vpblendvb m0, m6, m0, m1 + mova [dstq+32*0], m0 + movu m0, [tlq+r3*2+32] + movu m1, [tlq+r3*2+34] + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + pcmpgtw m1, m7, m3 + vpblendvb m0, m6, m0, m1 + mova [dstq+32*1], m0 + movu m0, [tlq+r3*2+64] + movu m1, [tlq+r3*2+66] + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + pcmpgtw m1, m8, m3 + vpblendvb m0, m6, m0, m1 + mova [dstq+32*2], m0 + movu m0, [tlq+r3*2+96] + movu m1, [tlq+r3*2+98] + add r5d, dxd + psubw m1, m0 + pmulhrsw m1, m2 + pcmpgtw m2, m9, m3 + paddw m3, m4 + paddw m0, m1 + vpblendvb m0, m6, m0, m2 + mova [dstq+32*3], m0 + dec hd + jz .w64_end + add dstq, strideq + cmp r5d, maxbased + jb .w64_loop +.w64_end_loop: + mova [dstq+32*0], m6 + mova [dstq+32*1], m6 + mova [dstq+32*2], m6 + mova [dstq+32*3], m6 + add dstq, strideq + dec hd + jg .w64_end_loop +.w64_end: + RET + +cglobal ipred_z2_16bpc, 3, 12, 12, 352, dst, stride, tl, w, h, angle, dx, dy +%define base r9-z_filter_t0 + lea r9, [ipred_z2_16bpc_avx2_table] + tzcnt wd, wm + movifnidn angled, anglem + movifnidn hd, hm + lea dxq, [dr_intra_derivative-90] + movsxd wq, [r9+wq*4] + mova m1, [tlq- 0] + movzx dyd, angleb + xor angled, 0x400 + mova m2, [tlq- 32] + mov r8, dxq + sub dxq, dyq + mova m3, [tlq- 64] + add wq, r9 + add r9, z_filter_t0-ipred_z2_16bpc_avx2_table + mova m4, [tlq- 96] + and dyd, ~1 + mova m5, [tlq-128] + and dxq, ~1 + movzx dyd, word [r8+dyq] ; angle - 90 + movzx dxd, word [dxq+270] ; 180 - angle + vpbroadcastd m11, [base+pw_62] + mova [rsp+128], m1 + mova [rsp+ 96], m2 + mova [rsp+ 64], m3 + neg dxd + mova [rsp+ 32], m4 + neg dyq + mova [rsp+ 0], m5 + jmp wq +.w4: + vbroadcasti128 m10, [base+z2_x_shuf] + vpbroadcastq m6, [base+z_base_inc+2] + lea r8d, [dxq+(65<<6)] ; xpos + mov r10d, (63-4)<<6 + test angled, 0x400 + jnz .w4_main ; !enable_intra_edge_filter + lea r3d, [hq+2] + add angled, 1022 + shl r3d, 6 + test r3d, angled + jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) + movq xm0, [tlq+2] ; 1 2 3 4 + movq xm1, [tlq+0] ; 0 1 2 3 + pshuflw xm2, xm0, q3321 ; 2 3 4 4 + pshuflw xm3, xm1, q2100 ; 0 0 1 2 + vpbroadcastw xm4, r8m ; pixel_max + vbroadcasti128 m10, [base+z_upsample] + paddw xm1, xm0 + paddw xm2, xm3 + lea r8d, [r8+dxq+(1<<6)] + psubw xm2, xm1, xm2 + add dxd, dxd + psraw xm2, 3 + pxor xm3, xm3 + sub r10d, 3<<6 + paddw xm1, xm2 + paddw m6, m6 + pmaxsw xm1, xm3 + sub angled, 1075 ; angle - 53 + pavgw xm1, xm3 + lea r3d, [hq+3] + pminsw xm1, xm4 + xor angled, 0x7f ; 180 - angle + punpcklwd xm1, xm0 + movu 
[rsp+130], xm1 + call .filter_strength + jmp .w4_filter_left +ALIGN function_align +.filter_strength: + movd xm8, r3d + mov r3d, angled + movd xm7, angled + vpbroadcastb m8, xm8 + shr r3d, 8 ; is_sm << 1 + vpbroadcastb m7, xm7 + pcmpeqb m8, [base+z_filter_wh] + mova xm9, [r9+r3*8] + pand m0, m8, m7 + pcmpgtb m0, m9 + pmovmskb r3d, m0 + ret +ALIGN function_align +.upsample_left: ; h4/h8 + mova xm0, [tlq-16] ; 8 7 6 5 4 3 2 1 + movu xm1, [tlq-14] ; 7 6 5 4 3 2 1 0 + vpbroadcastw xm4, r8m ; pixel_max + cmp hd, 8 + je .upsample_left_h8 + pshufhw xm2, xm0, q2100 ; _ _ _ _ 4 4 3 2 + pshufhw xm3, xm1, q3321 ; _ _ _ _ 2 1 0 0 + jmp .upsample_left_end +.upsample_left_h8: + pblendw xm2, xm0, [tlq-18], 0xfe ; 8 8 7 6 5 4 3 2 + pblendw xm3, xm1, [tlq-12], 0x7f ; 6 5 4 3 2 1 0 0 +.upsample_left_end: + paddw xm1, xm0 + paddw xm2, xm3 + psubw xm2, xm1, xm2 + add dyq, dyq + psraw xm2, 3 + pxor xm3, xm3 + paddw xm1, xm2 + pmaxsw xm1, xm3 + pavgw xm1, xm3 + pminsw xm1, xm4 + punpcklwd xm2, xm0, xm1 + punpckhwd xm0, xm1 + mova [rsp+ 96+gprsize], xm2 + mova [rsp+112+gprsize], xm0 + ret +.w4_no_upsample_above: + lea r3d, [hq+3] + sub angled, 1112 ; angle - 90 + call .filter_strength + test r3d, r3d + jz .w4_no_filter_above + popcnt r3d, r3d + vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] + vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*0] + psrldq xm0, xm1, 2 ; 1 2 3 4 + pshuflw xm2, xm1, q2100 ; 0 0 1 2 + pmullw xm4, xm0 + pshuflw xm3, xm0, q3321 ; 2 3 4 4 + paddw xm1, xm3 + pshuflw xm3, xm0, q3332 ; 3 4 4 4 + pmullw xm1, xm5 + vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*2] + paddw xm2, xm3 + vpbroadcastd xm3, r6m ; max_width + pmullw xm2, xm5 + packssdw xm3, xm3 + paddw xm1, xm4 + paddw xm1, xm2 + psubw xm3, [base+pw_1to16] + pxor xm4, xm4 + psrlw xm1, 3 + pminsw xm3, xm11 ; clip to byte range since there's no variable word blend + pavgw xm1, xm4 + vpblendvb xm1, xm0, xm3 + movq [rsp+130], xm1 +.w4_no_filter_above: + lea r3d, [hq+2] + add angled, 973 ; angle + 883 + shl r3d, 6 + test r3d, angled + jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8) + vpbroadcastd xm0, [base+pb_90] + psubb xm0, xm7 ; 180 - angle + pand xm0, xm8 ; reuse from previous filter_strength call + pcmpgtb xm0, xm9 + pmovmskb r3d, xm0 +.w4_filter_left: + test r3d, r3d + jz .w4_main + popcnt r3d, r3d + mova m0, [tlq-32] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + vpbroadcastd m5, r7m ; max_height + cmp r3d, 3 + je .w4_filter_left_s3 + vpbroadcastd m2, [base+z_filter_k-4+r3*4+12*1] + vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] + pmullw m2, m0 + cmp hd, 8 + jl .w4_filter_left_h4 + movu m4, [tlq-34] + punpcklwd m1, m0, m0 + vpblendd m1, m4, 0xee ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e + je .w4_filter_left_end + vpblendd m1, m4, 0x10 ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + jmp .w4_filter_left_end +.w4_upsample_left: + call .upsample_left + mov r11, -16 + vbroadcasti128 m9, [base+z_upsample] + jmp .w4_main_upsample_left +.w4_filter_left_s3: ; can only be h16 + movu m2, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + vpbroadcastd m4, [base+pw_3] + paddw m1, m0, m2 + punpckhwd m2, m2 + vpblendd m2, [tlq-28], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g + punpcklwd xm3, xm0, xm0 + paddw m2, m4 + vpblendd m4, m3, [tlq-34], 0xfe ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e + vpblendd m3, [tlq-36], 0xfe ; 0 0 0 1 2 3 4 5 6 8 8 9 a b c d + paddw m1, m4 + pavgw m2, m3 + paddw m1, m2 + psrlw m1, 2 + jmp .w4_filter_left_end2 +.w4_filter_left_h4: + pshufhw m1, m0, q2100 ; _ _ _ _ _ _ _ _ _ _ _ _ c c d e +.w4_filter_left_end: + paddw m1, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c 
d e f g + pmullw m1, m3 + paddw m1, m2 + pxor m2, m2 + psrlw m1, 3 + pavgw m1, m2 +.w4_filter_left_end2: + packssdw m5, m5 + psubw m5, [base+pw_16to1] + pminsw m5, m11 + vpblendvb m1, m0, m5 + mova [rsp+96], m1 +.w4_main: + vbroadcasti128 m9, [base+z2_x_shuf] + mov r11, -8 +.w4_main_upsample_left: + movd xm5, dyd + mova m4, [base+z2_y_shuf_h4] + mov r2d, r8d + movd xm0, dxd + vpbroadcastw m5, xm5 + rorx r5, dyq, 5 + lea r8d, [dyq*3] + pmullw m5, [base+z2_ymul] + rorx r9, dyq, 4 + sar dyd, 6 + vpbroadcastw m0, xm0 + sar r8d, 6 + pand m5, m11 ; frac_y + neg dyd + psllw m5, 9 + add r5d, dyd + add r8d, dyd + add r9d, dyd + paddw m7, m0, m0 + lea dyq, [rsp+dyq*2+126] + vpblendd m0, m7, 0xcc + add dyq, r11 + neg r5d + paddw m1, m0, m7 + neg r8d + vpblendd m0, m1, 0xf0 ; xpos0 xpos1 xpos2 xpos3 + neg r9d + paddw m7, m7 + paddw m6, m0 +.w4_loop: + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x0 + movu xm1, [rsp+r2*2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x1 + movu xm3, [rsp+r3*2] + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x2 + vinserti128 m1, [rsp+r2*2], 1 + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x3 + vinserti128 m3, [rsp+r3*2], 1 + pshufb m1, m10 ; a0 a1 a2 a3 A0 A1 A2 A3 + pshufb m3, m10 ; b0 b1 b2 b3 B0 B1 B2 B3 + pand m2, m11, m6 + punpcklqdq m0, m1, m3 + punpckhqdq m1, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + cmp r3d, 64 + jge .w4_toponly + movu xm2, [dyq] + vinserti128 m2, [dyq+r8*2], 1 + movu xm3, [dyq+r5*2] + vinserti128 m3, [dyq+r9*2], 1 + pshufb m2, m9 + pshufb m3, m9 + punpckhwd m1, m2, m3 ; a3 b3 a2 b2 a1 b1 a0 b0 + punpcklwd m2, m3 + psubw m2, m1 + pmulhrsw m2, m5 + psraw m3, m6, 15 ; base_x < topleft + paddw m1, m2 + vpermd m1, m4, m1 ; a0 b0 c0 d0 a1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3 + vpblendvb m0, m1, m3 +.w4_toponly: + paddw m6, m7 ; xpos += dx + lea r3, [strideq*3] + add dyq, r11 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r3 ], xm1 + sub hd, 4 + jz .w4_end + lea dstq, [dstq+strideq*4] + cmp r2d, r10d + jge .w4_loop +.w4_leftonly_loop: + movu xm1, [dyq] + vinserti128 m1, [dyq+r8*2], 1 + movu xm2, [dyq+r5*2] + vinserti128 m2, [dyq+r9*2], 1 + add dyq, r11 + pshufb m1, m9 + pshufb m2, m9 + punpckhwd m0, m1, m2 + punpcklwd m1, m2 + psubw m1, m0 + pmulhrsw m1, m5 + paddw m0, m1 + vpermd m0, m4, m0 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r3 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_leftonly_loop +.w4_end: + RET +.w8: + mov r10d, hd + test angled, 0x400 + jnz .w8_main + lea r3d, [angleq+126] + xor r8d, r8d + mov r3b, hb + cmp r3d, 8 + ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm + movu xm0, [tlq+2] ; 1 2 3 4 5 6 7 8 + mova xm1, [tlq+0] ; 0 1 2 3 4 5 6 7 + pblendw xm2, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8 + pblendw xm3, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 + vpbroadcastw xm4, r8m ; pixel_max + paddw xm1, xm0 + paddw xm2, xm3 + not r8d + psubw xm2, xm1, xm2 + add dxd, dxd + psraw xm2, 3 + sub angled, 53 ; angle - 53 + pxor xm3, xm3 + paddw xm2, xm1 + lea r3d, [hq+7] + pmaxsw xm2, xm3 + xor angled, 0x7f ; 180 - angle + pavgw xm2, xm3 + pminsw xm2, xm4 + punpcklwd xm1, xm2, xm0 + punpckhwd xm2, xm0 + movu [rsp+130], xm1 + movu [rsp+146], xm2 + call .filter_strength + jmp .w8_filter_left +.w8_no_upsample_above: + lea r3d, [hq+7] + sub angled, 90 ; angle - 90 + call .filter_strength + test r3d, r3d + jz .w8_no_filter_above + popcnt r3d, r3d + vpbroadcastd xm4, 
[base+z_filter_k-4+r3*4+12*1] + vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*0] + vpbroadcastd xm6, [base+z_filter_k-4+r3*4+12*2] + movu xm0, [tlq+2] ; 1 2 3 4 5 6 7 8 x + pblendw xm2, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 x + pmullw xm4, xm0 + pblendw xm3, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8 x + paddw xm1, xm3 + vpblendd xm3, [tlq+6], 0x07 ; 3 4 5 6 7 8 8 8 x + paddw xm2, xm3 + vpbroadcastd xm3, r6m ; max_width + pmullw xm1, xm5 + pmullw xm2, xm6 + packssdw xm3, xm3 + paddw xm1, xm4 + paddw xm1, xm2 + psubw xm3, [base+pw_1to16] + pxor xm4, xm4 + psrlw xm1, 3 + pminsw xm3, xm11 + pavgw xm1, xm4 + vpblendvb xm1, xm0, xm3 + movu [rsp+130], xm1 +.w8_no_filter_above: + lea r3d, [angleq-51] + mov r3b, hb + cmp r3d, 8 + jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm + vpbroadcastd m0, [base+pb_90] + psubb m0, m7 + pand m0, m8 + pcmpgtb m0, m9 + pmovmskb r3d, m0 +.w8_filter_left: + test r3d, r3d + jz .w8_main + popcnt r3d, r3d + cmp r3d, 3 + jne .w8_filter_left_s12 + vpbroadcastd m6, [base+pw_3] + vpbroadcastd m7, [base+pw_16] + cmp hd, 16 ; flags needed for later + jmp .filter_left_s3b +.w8_upsample_left: + call .upsample_left + vbroadcasti128 m7, [base+z2_y_shuf_us] + lea r11, [rsp+118] + mov r8, -8 + jmp .w8_main_upsample_left +.w16_filter_left_s12: + xor r8d, r8d +.w8_filter_left_s12: + mova m0, [tlq-32] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + vpbroadcastd m5, r7m ; max_height + vpbroadcastd m2, [base+z_filter_k-4+r3*4+12*1] + vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] + pmullw m2, m0 + cmp hd, 8 + jl .w8_filter_left_h4 + movu m4, [tlq-34] + punpcklwd m1, m0, m0 + vpblendd m1, m4, 0xee ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e + je .w8_filter_left_end + vpblendd m1, m4, 0x10 ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + jmp .w8_filter_left_end +.w8_filter_left_h4: + pshufhw m1, m0, q2100 ; _ _ _ _ _ _ _ _ _ _ _ _ c c d e +.w8_filter_left_end: + paddw m1, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + pmullw m1, m3 + paddw m1, m2 + pxor m2, m2 + psrlw m1, 3 + pavgw m1, m2 + packssdw m5, m5 + psubw m5, [base+pw_16to1] + pminsw m5, m11 + vpblendvb m1, m0, m5 + mova [rsp+96], m1 + test r8d, r8d + jz .w8_main +; upsample_main + vbroadcasti128 m10, [base+z_upsample] + vbroadcasti128 m7, [base+z2_y_shuf] + lea r5, [rsp+120] + movd xm1, dyd + vbroadcasti128 m4, [base+z_base_inc+2] + movd xm2, dxd + vpbroadcastw m1, xm1 + vpbroadcastw m2, xm2 + mov r7, dstq + paddw m4, m4 + pmullw m0, m1, [base+z2_ymul8] + paddw m5, m2, m2 + psllw xm1, 3 + vpblendd m2, m5, 0xf0 + lea r2d, [dxq+(66<<6)] ; xpos + paddw m4, m2 + pshufd m6, m0, q2020 + psraw xm0, 6 + pxor xm1, xm1 + psubw xm8, xm1, xm0 + pand m6, m11 + punpckhwd xm9, xm8, xm1 + psllw m6, 9 + punpcklwd xm8, xm1 +.w8_upsample_above_loop: + lea r3d, [r2+dxq] + shr r2d, 6 + movu xm1, [rsp+r2*2] + movu xm2, [rsp+r2*2+16] + lea r2d, [r3+dxq] + shr r3d, 6 + vinserti128 m1, [rsp+r3*2], 1 + vinserti128 m2, [rsp+r3*2+16], 1 + pshufb m1, m10 + pshufb m2, m10 + punpcklqdq m0, m1, m2 ; a0 b0 c0 d0 e0 f0 g0 h0 + punpckhqdq m1, m2 + pand m2, m11, m4 + psubw m1, m0 + psllw m2, 9 + pmulhrsw m1, m2 + paddw m0, m1 + cmp r3d, 64 + jge .w8_upsample_above_toponly + mova m1, m5 + vpgatherdq m3, [r5+xm9*2], m5 + mova m5, m1 + vpgatherdq m2, [r5+xm8*2], m1 + pshufb m3, m7 + pshufb m2, m7 + punpckldq m1, m2, m3 + punpckhdq m2, m3 + psubw m2, m1 + pmulhrsw m2, m6 + paddw m1, m2 + vpermq m1, m1, q3120 + psraw m2, m4, 15 + vpblendvb m0, m1, m2 +.w8_upsample_above_toponly: + paddw m4, m5 + sub r5, 4 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + sub hd, 2 + jz .w8_ret + 
lea dstq, [dstq+strideq*2] + jmp .w8_upsample_above_loop +.w8_main: + vbroadcasti128 m7, [base+z2_y_shuf] + lea r11, [rsp+120] + mov r8, -4 +.w8_main_upsample_left: + movd xm1, dyd + vbroadcasti128 m4, [base+z_base_inc+2] + movd xm2, dxd + vpbroadcastw m1, xm1 + vpbroadcastw m2, xm2 + mov r7, dstq + pmullw m0, m1, [base+z2_ymul8] + paddw m5, m2, m2 + psllw xm1, 3 + vpblendd m2, m5, 0xf0 ; xpos0 xpos1 + lea r9d, [dxq+(65<<6)] ; xpos + paddw m4, m2 + movd [rsp+284], xm1 +.w8_loop0: + mov r2d, r9d + mova [rsp+288], m0 + mov r5, r11 + mova [rsp+320], m4 + pshufd m6, m0, q2020 + psraw xm0, 6 + pxor xm1, xm1 + psubw xm8, xm1, xm0 ; base_y + pand m6, m11 ; frac_y + punpckhwd xm9, xm8, xm1 ; base_y 2 3 6 7 + psllw m6, 9 + punpcklwd xm8, xm1 ; base_y 0 1 4 5 +.w8_loop: + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x0 + movu xm0, [rsp+r2*2] + movu xm1, [rsp+r2*2+2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x1 + vinserti128 m0, [rsp+r3*2], 1 + vinserti128 m1, [rsp+r3*2+2], 1 + pand m2, m11, m4 + psubw m1, m0 + psllw m2, 9 + pmulhrsw m1, m2 + paddw m0, m1 + cmp r3d, 64 + jge .w8_toponly + mova m1, m5 + vpgatherdq m3, [r5+xm9*2], m5 + mova m5, m1 + vpgatherdq m2, [r5+xm8*2], m1 + pshufb m3, m7 ; c0 d0 c1 d1 g0 h0 g1 h1 + pshufb m2, m7 ; a0 b0 a1 b1 e0 f0 e1 f1 + punpckldq m1, m2, m3 ; a0 b0 c0 d0 a1 b1 c1 d1 e0 f0 g0 h0 e1 f1 g1 h1 + punpckhdq m2, m3 + psubw m2, m1 + pmulhrsw m2, m6 + paddw m1, m2 + vpermq m1, m1, q3120 + psraw m2, m4, 15 ; base_x < topleft + vpblendvb m0, m1, m2 +.w8_toponly: + paddw m4, m5 ; xpos += dx + add r5, r8 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + sub hd, 2 + jz .w8_end + lea dstq, [dstq+strideq*2] + cmp r2d, (63-8)<<6 + jge .w8_loop +.w8_leftonly_loop: + mova m0, m5 + vpgatherdq m4, [r5+xm9*2], m5 + mova m5, m0 + vpgatherdq m3, [r5+xm8*2], m0 + add r5, r8 + pshufb m2, m4, m7 + pshufb m1, m3, m7 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + psubw m1, m0 + pmulhrsw m1, m6 + paddw m0, m1 + vpermq m0, m0, q3120 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_leftonly_loop +.w8_end: + sub r10d, 1<<8 + jl .w8_ret + vpbroadcastd m0, [rsp+284] + add r7, 16 + paddw m0, [rsp+288] ; base_y += 8*dy + add r9d, 8<<6 + vpbroadcastd m4, [pw_512] + movzx hd, r10b + paddw m4, [rsp+320] ; base_x += 8*64 + mov dstq, r7 + jmp .w8_loop0 +.w8_ret: + RET +.w16: + movd xm0, [tlq+32] + lea r10d, [hq+(1<<8)] + movd [rsp+160], xm0 + test angled, 0x400 + jnz .w8_main + lea r3d, [hq+15] + sub angled, 90 + call .filter_strength + test r3d, r3d + jz .w16_no_filter_above + popcnt r3d, r3d + vpbroadcastd m4, [base+z_filter_k-4+r3*4+12*1] + vpbroadcastd m5, [base+z_filter_k-4+r3*4+12*0] + vpbroadcastd m6, [base+z_filter_k-4+r3*4+12*2] + movu m0, [tlq+2] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + punpcklwd xm2, xm1, xm1 + vpblendd m2, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + punpckhwd m3, m0, m0 + pmullw m4, m0 + vpblendd m3, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g + paddw m1, m3 + vpblendd m3, [tlq+6], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g g g + paddw m2, m3 + vpbroadcastd m3, r6m ; max_width + pmullw m1, m5 + pmullw m2, m6 + packssdw m3, m3 + paddw m1, m4 + paddw m1, m2 + psubw m3, [base+pw_1to16] + pxor m4, m4 + psrlw m1, 3 + pminsw m3, m11 + pavgw m1, m4 + vpblendvb m1, m0, m3 + movu [rsp+130], m1 +.w16_no_filter_above: + vpbroadcastd m0, [base+pb_90] + psubb m0, m7 + pand m0, m8 + pcmpgtb m0, m9 + pmovmskb r3d, m0 + test r3d, r3d + jz .w8_main + popcnt r3d, r3d + cmp r3d, 3 + jne .w16_filter_left_s12 + 
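+; strength 3 takes the 5-tap path below; strengths 1 and 2 reuse the 3-tap
+; .w16_filter_left_s12 code above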
vpbroadcastd m6, [base+pw_3] + vpbroadcastd m7, [base+pw_16] + cmp hd, 4 + jne .filter_left_s3 + movq xm0, [tlq-8] ; 0 1 2 3 + movq xm1, [tlq-6] ; 1 2 3 4 + vpbroadcastd xm5, r7m ; max_height + movq xm4, [base+pw_16to1+24] ; 4to1 + pshuflw xm2, xm0, q2100 ; 0 0 1 2 + pshuflw xm3, xm1, q3321 ; 2 3 4 4 + paddw xm1, xm0 + paddw xm1, xm2 + pshuflw xm2, xm0, q1000 ; 0 0 0 1 + paddw xm3, xm6 + packssdw xm5, xm5 + pavgw xm2, xm3 + psubw xm5, xm4 + paddw xm1, xm2 + pminsw xm5, xm11 + psrlw xm1, 2 + vpblendvb xm1, xm0, xm5 + movq [rsp+120], xm1 + jmp .w8_main +.w32: + mova m2, [tlq+32] + movd xm0, [tlq+64] + lea r10d, [hq+(3<<8)] + mova [rsp+160], m2 + movd [rsp+192], xm0 + test angled, 0x400 + jnz .w8_main + vpbroadcastd m6, [base+pw_3] + vpbroadcastd m0, r6m ; max_width + vpbroadcastd m7, [base+pw_16] + mov r3d, 32 + packssdw m0, m0 + psubw m0, [base+pw_1to16] + pminsw m8, m0, m11 + psubw m9, m8, m7 +.w32_filter_above: + movu m0, [tlq+2] + punpcklwd xm4, xm1, xm1 + paddw m2, m6, [tlq+6] + paddw m1, m0 + vpblendd m4, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + paddw m1, [tlq+4] + movu m3, [tlq+r3+2] + paddw m5, m6, [tlq+r3-2] + pavgw m2, m4 + punpckhwd m4, m3, m3 + paddw m1, m2 + vpblendd m2, m4, [tlq+r3+6], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h + vpblendd m4, [tlq+r3+4], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h + pavgw m2, m5 + paddw m5, m3, [tlq+r3] + paddw m4, m5 + psrlw m1, 2 + paddw m2, m4 + vpblendvb m1, m0, m8 + psrlw m2, 2 + vpblendvb m2, m3, m9 + movu [rsp+130], m1 + movu [rsp+r3+130], m2 +.filter_left_s3: + cmp hd, 16 + jl .filter_left_s3_h8 ; h8 +.filter_left_s3b: + mova m0, [tlq-32] ; 2 3 4 5 6 7 8 9 a b c d e f g h + movu m2, [tlq-30] ; 3 4 5 6 7 8 9 a b c d e f g h i + vpbroadcastd m5, r7m ; max_height + paddw m1, m0, m2 + punpckhwd m2, m2 + mov r3d, hd + vpblendd m2, [tlq-28], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i + packssdw m5, m5 + not r3 + psubw m5, [base+pw_16to1] + paddw m2, m6 + pminsw m8, m11, m5 + je .filter_left_s3_end ; h16 + paddw m1, [tlq-34] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + pavgw m2, [tlq-36] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw m1, m2 + psrlw m1, 2 + vpblendvb m3, m1, m0, m8 + mova m0, [tlq-64] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m1, m0, [tlq-62] ; 3 4 5 6 7 8 9 a b c d e f g h i + paddw m2, m6, [tlq-60] ; 4 5 6 7 8 9 a b c d e f g h i j + psubw m8, m7 + mova [rsp+96], m3 + jnp .filter_left_s3_end ; h32 + mova m5, [tlq-96] + paddw m1, [tlq-66] + pavgw m2, [tlq-68] + paddw m1, m2 + paddw m4, m5, [tlq-94] + paddw m2, m6, [tlq-92] + psrlw m1, 2 + paddw m4, [tlq- 98] + pavgw m2, [tlq-100] + vpblendvb m3, m1, m0, m8 + mova m0, [tlq-128] + psubw m8, m7 + paddw m4, m2 + paddw m1, m0, [tlq-126] + paddw m2, m6, [tlq-124] + psrlw m4, 2 + mova [rsp+64], m3 + vpblendvb m4, m5, m8 + psubw m8, m7 + mova [rsp+32], m4 +.filter_left_s3_end: + punpcklwd xm3, xm0, xm0 + vpblendd m4, m3, [tlq+r3*2], 0xfe ; 2 2 3 4 5 6 7 8 9 a b c d e f g + vpblendd m3, [tlq+r3*2-2], 0xfe ; 2 2 2 3 4 5 6 7 8 9 a b c d e f + paddw m1, m4 + pavgw m2, m3 + paddw m1, m2 + psrlw m1, 2 + vpblendvb m1, m0, m8 + mova [rsp+r3*2+130], m1 + jmp .w8_main +.filter_left_s3_h8: + mova xm0, [tlq-16] ; 0 1 2 3 4 5 6 7 + movu xm3, [tlq-14] ; 1 2 3 4 5 6 7 8 + pblendw xm2, xm0, [tlq-18], 0xfe ; 0 0 1 2 3 4 5 6 + vpbroadcastd xm5, r7m ; max_height + paddw xm1, xm0, xm3 + pblendw xm3, [tlq-12], 0x7f ; 2 3 4 5 6 7 8 8 + paddw xm1, xm2 + vpblendd xm2, [tlq-20], 0x0e ; 0 0 0 1 2 3 4 5 + paddw xm3, xm6 + packssdw xm5, xm5 + pavgw xm2, xm3 + psubw xm5, [base+pw_16to1+16] ; 8to1 + paddw xm1, xm2 + pminsw xm5, 
xm11 + psrlw xm1, 2 + vpblendvb xm1, xm0, xm5 + mova [rsp+112], xm1 + jmp .w8_main +.w64: + mova m2, [tlq+ 32] + mova m3, [tlq+ 64] + mova m4, [tlq+ 96] + movd xm0, [tlq+128] + lea r10d, [hq+(7<<8)] + mova [rsp+160], m2 + mova [rsp+192], m3 + mova [rsp+224], m4 + movd [rsp+256], xm0 + test angled, 0x400 + jnz .w8_main + vpbroadcastd m6, [base+pw_3] + movu m0, [tlq+34] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m2, m6, [tlq+30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw m5, m0, [tlq+32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + pavgw m2, [tlq+38] ; 4 5 6 7 8 9 a b c d e f g h h h + paddw m5, [tlq+36] ; 3 4 5 6 7 8 9 a b c d e f g h h + movu m4, [tlq+66] + paddw m3, m6, [tlq+62] + paddw m7, m4, [tlq+64] + pavgw m3, [tlq+70] + paddw m7, [tlq+68] + paddw m2, m5 + vpbroadcastd m5, r6m ; max_width + mov r3d, 96 + packssdw m5, m5 + paddw m3, m7 + psubw m5, [base+pw_1to16] + psrlw m2, 2 + vpbroadcastd m7, [base+pw_16] + psrlw m3, 2 + pminsw m8, m11, m5 + psubw m9, m8, m7 + vpblendvb m2, m0, m9 + psubw m9, m7 + vpblendvb m3, m4, m9 + psubw m9, m7 + movu [rsp+162], m2 + movu [rsp+194], m3 + jmp .w32_filter_above + +cglobal ipred_z3_16bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase + %assign org_stack_offset stack_offset + lea r6, [ipred_z3_16bpc_avx2_table] + tzcnt hd, hm + movifnidn angled, anglem + lea r7, [dr_intra_derivative+45*2-1] + sub tlq, 2 + movsxd hq, [r6+hq*4] + sub angled, 180 + add hq, r6 + mov dyd, angled + neg dyd + xor angled, 0x400 + or dyq, ~0x7e + movzx dyd, word [r7+dyq] + vpbroadcastd m5, [pw_62] + mov org_wd, wd + jmp hq +.h4: + ALLOC_STACK -64, 7 + lea r7, [strideq*3] + cmp angleb, 40 + jae .h4_no_upsample + lea r4d, [angleq-1024] + sar r4d, 7 + add r4d, wd + jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm) + mova xm2, [tlq-14] ; 0 1 2 3 4 5 6 7 + pblendw xm1, xm2, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6 + vpblendd xm0, xm1, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5 + pshufd xm3, xm1, q0000 + paddw xm1, xm2 + paddw xm0, [tlq-12] ; 1 2 3 4 5 6 7 8 + vpbroadcastw xm4, r8m ; pixel_max + add dyd, dyd + psubw xm0, xm1, xm0 + mova [rsp+ 0], xm3 + movd xm3, dyd + psraw xm0, 3 + neg dyd + paddw xm1, xm0 + pxor xm0, xm0 + lea r2d, [dyq+(16<<6)+63] ; ypos + pmaxsw xm1, xm0 + pavgw xm1, xm0 + vpbroadcastw m3, xm3 + pminsw xm1, xm4 + punpckhwd xm0, xm1, xm2 + punpcklwd xm1, xm2 + paddw m2, m3, m3 + mova [rsp+32], xm0 + punpcklwd m3, m2 + mova [rsp+16], xm1 + paddw m4, m2, m2 + paddw m2, m3 + vpblendd m3, m2, 0xf0 ; ypos0 ypos1 ypos2 ypos3 +.h4_upsample_loop: + lea r4d, [r2+dyq] + shr r2d, 6 + movu xm1, [rsp+r2*2] + lea r2d, [r4+dyq] + shr r4d, 6 + movu xm2, [rsp+r4*2] + lea r4d, [r2+dyq] + shr r2d, 6 + vinserti128 m1, [rsp+r2*2], 1 + lea r2d, [r4+dyq] + shr r4d, 6 + vinserti128 m2, [rsp+r4*2], 1 + psrld m0, m1, 16 + pblendw m0, m2, 0xaa ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0 + pslld m2, 16 + pblendw m1, m2, 0xaa + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m3, m4 + paddw m1, m0 + vextracti128 xm2, m1, 1 + punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0 + punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2 + movhps [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm0 + movhps [dstq+strideq*2], xm1 + movq [dstq+r7 ], xm1 + add dstq, 8 + sub wd, 4 + jg .h4_upsample_loop + RET +ALIGN function_align +.filter_strength: ; h4/h8/h16 +%define base r4-z_filter_t0 + lea r4, [z_filter_t0] + movd xm0, maxbased + movd xm1, angled + shr angled, 8 ; is_sm << 1 + vpbroadcastb m0, xm0 + vpbroadcastb m1, xm1 + pcmpeqb m0, [base+z_filter_wh] + pand m0, m1 + 
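+; match (block size, angle, is_sm) against the threshold tables; the caller
+; derives the filter strength from the popcount of the resulting mask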
mova xm1, [r4+angleq*8] + pcmpgtb m0, m1 + pmovmskb r5d, m0 + ret +.h4_no_upsample: + mov maxbased, 7 + test angled, 0x400 ; !enable_intra_edge_filter + jnz .h4_main + lea maxbased, [wq+3] + call .filter_strength + mov maxbased, 7 + test r5d, r5d + jz .h4_main ; filter_strength == 0 + popcnt r5d, r5d + mova xm0, [tlq-14] ; 0 1 2 3 4 5 6 7 + movu xm3, [tlq-12] ; 1 2 3 4 5 6 7 8 + vpbroadcastd xm2, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd xm4, [base+z_filter_k-4+r5*4+12*0] + pmullw xm2, xm0 + pblendw xm0, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6 + paddw xm1, xm0, xm3 + movd [rsp+12], xm0 + pmullw xm1, xm4 + cmp r5d, 3 + jne .h4_filter_3tap + pblendw xm3, [tlq-10], 0x7f ; 2 3 4 5 6 7 8 8 + vpblendd xm0, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5 + movzx r4d, word [tlq-14] + movzx r2d, word [tlq-12] + inc maxbased + paddw xm1, xm2 + paddw xm0, xm3 + sub r2d, r4d + paddw xm2, xm0, xm0 + lea r2d, [r2+r4*8+4] + shr r2d, 3 + mov [rsp+14], r2w +.h4_filter_3tap: + pxor xm0, xm0 + paddw xm1, xm2 + lea tlq, [rsp+30] + psrlw xm1, 3 + cmp wd, 8 + sbb maxbased, -1 + pavgw xm0, xm1 + mova [rsp+16], xm0 +.h4_main: + movd xm3, dyd + neg maxbaseq + vbroadcasti128 m1, [z_base_inc] + vpbroadcastw m6, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m3, xm3 + lea r4d, [maxbaseq+3*64] + neg dyq + movd xm2, r4d + sub tlq, 8 + lea r4, [dyq+63] ; ypos + punpcklwd m1, m1 + paddw m0, m3, m3 + vpbroadcastw m2, xm2 + punpcklwd m3, m0 + paddw m4, m0, m0 + paddw m0, m3 + psubw m2, m1 + vpblendd m3, m0, 0xf0 ; ypos0 ypos1 ypos2 ypos3 + or maxbased, 63 + paddw m3, m2 +.h4_loop: + lea r5, [r4+dyq] + sar r4, 6 ; base0 + movu xm1, [tlq+r4*2] + lea r4, [r5+dyq] + sar r5, 6 ; base1 + movu xm2, [tlq+r5*2] + lea r5, [r4+dyq] + sar r4, 6 ; base2 + vinserti128 m1, [tlq+r4*2], 1 + lea r4, [r5+dyq] + sar r5, 6 ; base3 + vinserti128 m2, [tlq+r5*2], 1 + punpckhwd m0, m1, m2 + punpcklwd m1, m2 + pand m2, m5, m3 + palignr m0, m1, 4 ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + psraw m2, m3, 15 ; ypos < max_base_y + paddw m3, m4 + paddw m1, m0 + vpblendvb m1, m6, m1, m2 + vextracti128 xm2, m1, 1 + punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0 + punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2 + movhps [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm0 + movhps [dstq+strideq*2], xm1 + movq [dstq+r7 ], xm1 + sub wd, 4 + jz .h4_end + add dstq, 8 + cmp r4d, maxbased + jg .h4_loop +.h4_end_loop: + movq [dstq+strideq*0], xm6 + movq [dstq+strideq*1], xm6 + movq [dstq+strideq*2], xm6 + movq [dstq+r7 ], xm6 + add dstq, 8 + sub wd, 4 + jg .h4_end_loop +.h4_end: + RET +.h8: + lea r4d, [angleq+216] + %assign stack_offset org_stack_offset + ALLOC_STACK -64, 8 + mov r4b, wb + lea r7, [strideq*3] + cmp r4d, 8 + ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 + mova m2, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw m1, m2, [tlq-32] ; _ 0 1 2 3 4 5 6 7 8 9 a b c d e + movu m0, [tlq-34] ; _ _ 0 1 2 3 4 5 6 7 8 9 a b c d + cmp wd, 8 + je .h8_upsample_w8 + pshufhw xm3, xm2, q1000 + vpblendd m0, m3, 0x0f ; _ _ _ _ 4 4 4 5 6 7 8 9 a b c d +.h8_upsample_w8: + paddw m0, [tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + vpbroadcastw m4, r8m ; pixel_max + add dyd, dyd + psubw m0, m1, m0 + movd xm6, dyd + psraw m0, 3 + neg dyd + paddw m1, m0 + pxor m0, m0 + pmaxsw m1, m0 + lea r4d, [dyq+(16<<6)+63] ; ypos + pavgw m1, m0 + vpbroadcastw m6, xm6 + pminsw m1, m4 + punpckhwd m0, m1, m2 + punpcklwd m1, m2 + vextracti128 [rsp+48], m0, 1 + vextracti128 [rsp+32], m1, 1 + paddw m7, m6, m6 + mova [rsp+16], 
xm0 + mova [rsp+ 0], xm1 + punpcklwd m6, m7 ; ypos0 ypos1 +.h8_upsample_loop: + lea r2d, [r4+dyq] + shr r4d, 6 ; base0 + movu m1, [rsp+r4*2] + lea r4d, [r2+dyq] + shr r2d, 6 ; base1 + movu m2, [rsp+r2*2] + lea r2d, [r4+dyq] + shr r4d, 6 ; base2 + movu m3, [rsp+r4*2] + lea r4d, [r2+dyq] + shr r2d, 6 ; base3 + movu m4, [rsp+r2*2] + psrld m0, m1, 16 + pblendw m0, m2, 0xaa ; a7 b7 a6 b6 a5 b5 a4 b4 a3 b3 a2 b2 a1 b1 a0 b0 + pslld m2, 16 + pblendw m1, m2, 0xaa + psrld m2, m3, 16 + pblendw m2, m4, 0xaa ; c7 d7 c6 d6 c5 d5 c4 d4 c3 d3 c2 d2 c1 d1 c0 d0 + pslld m4, 16 + pblendw m3, m4, 0xaa + pand m4, m5, m6 + paddw m6, m7 + psllw m4, 9 + psubw m1, m0 + pmulhrsw m1, m4 + pand m4, m5, m6 + psllw m4, 9 + psubw m3, m2 + pmulhrsw m3, m4 + paddw m6, m7 + lea r2, [dstq+strideq*4] + paddw m1, m0 + paddw m3, m2 + punpckhdq m0, m1, m3 ; a5 b5 c5 d5 a4 b4 c4 d4 a1 b1 c1 d1 a0 b0 c0 d0 + punpckldq m1, m3 ; a7 b7 c7 d7 a6 b6 c6 d6 a3 b3 c3 d3 a2 b2 c2 d2 + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + movhps [r2 +strideq*0], xm0 + movq [r2 +strideq*1], xm0 + movhps [r2 +strideq*2], xm1 + movq [r2 +r7 ], xm1 + movhps [dstq+strideq*0], xm2 + movq [dstq+strideq*1], xm2 + movhps [dstq+strideq*2], xm3 + movq [dstq+r7 ], xm3 + add dstq, 8 + sub wd, 4 + jg .h8_upsample_loop + RET +.h8_no_intra_edge_filter: + and maxbased, 7 + or maxbased, 8 ; imin(w+7, 15) + jmp .h8_main +.h8_no_upsample: + lea maxbased, [wq+7] + test angled, 0x400 + jnz .h8_no_intra_edge_filter + call .filter_strength + test r5d, r5d + jz .h8_main + popcnt r5d, r5d + mova m0, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + movu m3, [tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + vpbroadcastd m2, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0] + pmullw m2, m0 + cmp wd, 8 + jl .h8_filter_w4 + punpcklwd xm0, xm0 + vpblendd m1, m0, [tlq-32], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + movd [rsp+28], xm0 + paddw m1, m3 + mov r4d, 16 + pmullw m1, m4 + cmovg maxbased, r4d + cmp r5d, 3 + jne .h8_filter_3tap + punpckhwd m3, m3 + vpblendd m0, [tlq-34], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d + vpblendd m3, [tlq-26], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g + movzx r4d, word [tlq-30] + movzx r2d, word [tlq-28] + inc maxbased + paddw m1, m2 + paddw m0, m3 + sub r2d, r4d + paddw m2, m0, m0 + lea r2d, [r2+r4*8+4] + shr r2d, 3 + mov [rsp+30], r2w + jmp .h8_filter_3tap +.h8_filter_w4: + pshufhw xm1, xm0, q2100 + vinserti128 m1, [tlq-16], 1 ; _ _ _ _ 4 4 5 6 7 8 9 a b c d e + paddw m1, m3 + pmullw m1, m4 +.h8_filter_3tap: + pxor m0, m0 + paddw m1, m2 + lea tlq, [rsp+62] + psrlw m1, 3 + pavgw m0, m1 + mova [rsp+32], m0 +.h8_main: + movd xm4, dyd + neg maxbaseq + vbroadcasti128 m1, [z_base_inc] + vpbroadcastw m7, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m4, xm4 + lea r4d, [maxbaseq+7*64] + neg dyq + movd xm2, r4d + sub tlq, 16 + lea r4, [dyq+63] + paddw m6, m4, m4 + vpbroadcastw m2, xm2 + vpblendd m4, m6, 0xf0 ; ypos0 ypos1 + psubw m2, m1 + or maxbased, 63 + paddw m4, m2 +.h8_loop: + lea r5, [r4+dyq] + sar r4, 6 ; base0 + movu xm0, [tlq+r4*2+2] + movu xm1, [tlq+r4*2] + lea r4, [r5+dyq] + sar r5, 6 ; base1 + vinserti128 m0, [tlq+r5*2+2], 1 + vinserti128 m1, [tlq+r5*2], 1 + lea r5, [r4+dyq] + sar r4, 6 ; base2 + pand m3, m5, m4 + psllw m3, 9 + psubw m1, m0 + pmulhrsw m1, m3 + psraw m3, m4, 15 + paddw m4, m6 + paddw m0, m1 + movu xm1, [tlq+r4*2+2] + movu xm2, [tlq+r4*2] + lea r4, [r5+dyq] + sar r5, 6 ; base3 + vpblendvb m0, m7, m0, m3 + vinserti128 m1, [tlq+r5*2+2], 1 + vinserti128 m2, [tlq+r5*2], 1 + pand m3, m5, m4 + psllw m3, 9 + 
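+; second column pair (base2/base3): interpolate between adjacent left-edge
+; samples with frac_y via pmulhrsw; out-of-range ypos gets the edge pixel (m7)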
psubw m2, m1 + pmulhrsw m2, m3 + psraw m3, m4, 15 + paddw m4, m6 + lea r5, [dstq+strideq*4] + paddw m1, m2 + vpblendvb m1, m7, m1, m3 + punpckhwd m2, m0, m1 ; a3 c3 a2 c2 a1 c1 a0 c0 b3 d3 b2 d2 b1 d1 b0 d0 + vextracti128 xm3, m2, 1 + punpcklwd m0, m1 ; a7 c7 a6 c6 a5 c5 a4 c4 b7 d7 b6 d6 b5 d5 b4 d4 + punpckhwd xm1, xm2, xm3 ; a1 b1 c1 d1 a0 b0 c0 d0 + punpcklwd xm2, xm3 ; a3 b3 c3 d3 a2 b2 c2 d2 + vextracti128 xm3, m0, 1 + movhps [dstq+strideq*0], xm1 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movq [dstq+r7 ], xm2 + punpckhwd xm1, xm0, xm3 ; a5 b5 c5 d5 a4 b4 c4 d4 + punpcklwd xm0, xm3 ; a7 b7 c7 d7 a6 b6 c6 d6 + movhps [r5 +strideq*0], xm1 + movq [r5 +strideq*1], xm1 + movhps [r5 +strideq*2], xm0 + movq [r5 +r7 ], xm0 + sub wd, 4 + jz .h8_end + add dstq, 8 + cmp r4d, maxbased + jg .h8_loop + lea r6, [strideq*5] + lea r2, [strideq+r7*2] ; stride*7 + test wd, 4 + jz .h8_end_loop + movq [dstq+strideq*0], xm7 + movq [dstq+strideq*1], xm7 + movq [dstq+strideq*2], xm7 + movq [dstq+r7 ], xm7 + movq [dstq+strideq*4], xm7 + movq [dstq+r6 ], xm7 + movq [dstq+r7*2 ], xm7 + movq [dstq+r2 ], xm7 + add dstq, 8 + sub wd, 4 + jz .h8_end +.h8_end_loop: + mova [dstq+strideq*0], xm7 + mova [dstq+strideq*1], xm7 + mova [dstq+strideq*2], xm7 + mova [dstq+r7 ], xm7 + mova [dstq+strideq*4], xm7 + mova [dstq+r6 ], xm7 + mova [dstq+r7*2 ], xm7 + mova [dstq+r2 ], xm7 + add dstq, 16 + sub wd, 8 + jg .h8_end_loop +.h8_end: + RET +.h16_no_intra_edge_filter: + and maxbased, 15 + or maxbased, 16 ; imin(w+15, 31) + jmp .h16_main +ALIGN function_align +.h16: + %assign stack_offset org_stack_offset + ALLOC_STACK -96, 10 + lea maxbased, [wq+15] + lea r7, [strideq*3] + test angled, 0x400 + jnz .h16_no_intra_edge_filter + call .filter_strength + test r5d, r5d + jz .h16_main ; filter_strength == 0 + popcnt r5d, r5d + movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i + paddw m1, m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + vpbroadcastd m6, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0] + pmullw m2, m6, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h + pmullw m1, m7 + paddw m1, m2 + cmp wd, 8 + jg .h16_filter_w16 + mova xm3, [tlq-46] ; 0 1 2 3 4 5 6 7 + pmullw xm6, xm3 + jl .h16_filter_w4 + pblendw xm3, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6 + cmp r5d, 3 + jne .h16_filter_w8_3tap + vpblendd xm4, xm3, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5 +.h16_filter_w8_5tap: + punpckhwd m0, m0 + vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i + paddw xm4, [tlq-42] ; 2 3 4 5 6 7 8 9 + paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + paddw xm4, xm4 + paddw m0, m0 + paddw xm6, xm4 + paddw m1, m0 +.h16_filter_w8_3tap: + paddw xm3, [tlq-44] ; 1 2 3 4 5 6 7 8 + pmullw xm3, xm7 + pxor m0, m0 + paddw xm3, xm6 + psrlw xm3, 3 + pavgw xm3, xm0 + mova [rsp+48], xm3 + jmp .h16_filter_end +.h16_filter_w4: + pshufhw xm3, xm3, q2100 ; _ _ _ _ 4 4 5 6 + cmp r5d, 3 + jne .h16_filter_w8_3tap + pshufhw xm4, xm3, q2100 ; _ _ _ _ 4 4 4 5 + jmp .h16_filter_w8_5tap +.h16_filter_w16: + mova m3, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + pmullw m6, m3 + punpcklwd xm3, xm3 + vpblendd m4, m3, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + paddw m4, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + mov r4d, 32 + cmp wd, 16 + cmovg maxbased, r4d + movd [rsp+28], xm3 + pmullw m4, m7 + cmp r5d, 3 + jne .h16_filter_w16_3tap + punpckhwd m0, m0 + vpblendd m3, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d + vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i + paddw m3, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c 
d e f g h + paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + movzx r4d, word [tlq-62] + movzx r2d, word [tlq-60] + or maxbased, 1 + paddw m3, m3 + sub r2d, r4d + paddw m0, m0 + lea r2d, [r2+r4*8+4] + paddw m4, m3 + shr r2d, 3 + paddw m1, m0 + mov [rsp+30], r2w +.h16_filter_w16_3tap: + pxor m0, m0 + paddw m4, m6 + psrlw m4, 3 + pavgw m4, m0 + mova [rsp+32], m4 +.h16_filter_end: + psrlw m1, 3 + lea tlq, [rsp+94] + pavgw m1, m0 + mova [rsp+64], m1 +.h16_main: + movd xm8, dyd + neg maxbaseq + vpbroadcastw m9, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m8, xm8 + lea r4d, [maxbaseq+dyq+15*64] + neg dyq + movd xm7, r4d + sub tlq, 32 + lea r4, [dyq+63] + vpbroadcastw m7, xm7 + or maxbased, 63 + psubw m7, [z_base_inc] +.h16_loop: + lea r5, [r4+dyq] + sar r4, 6 ; base0 + movu m0, [tlq+r4*2+2] + movu m2, [tlq+r4*2] + lea r4, [r5+dyq] + sar r5, 6 ; base1 + movu m1, [tlq+r5*2+2] + movu m3, [tlq+r5*2] + lea r5, [r4+dyq] + sar r4, 6 ; base2 + pand m6, m5, m7 + psllw m6, 9 + psubw m2, m0 + pmulhrsw m2, m6 + psraw m6, m7, 15 + paddw m7, m8 + paddw m0, m2 + movu m2, [tlq+r4*2+2] + movu m4, [tlq+r4*2] + lea r4, [r5+dyq] + sar r5, 6 ; base3 + vpblendvb m0, m9, m0, m6 + pand m6, m5, m7 + psllw m6, 9 + psubw m3, m1 + pmulhrsw m3, m6 + psraw m6, m7, 15 + paddw m7, m8 + paddw m1, m3 + vpblendvb m1, m9, m1, m6 + pand m6, m5, m7 + psllw m6, 9 + psubw m4, m2 + pmulhrsw m4, m6 + psraw m6, m7, 15 + paddw m7, m8 + paddw m2, m4 + movu m3, [tlq+r5*2+2] + movu m4, [tlq+r5*2] + vpblendvb m2, m9, m2, m6 + pand m6, m5, m7 + psllw m6, 9 + psubw m4, m3 + pmulhrsw m4, m6 + psraw m6, m7, 15 + paddw m7, m8 + lea r5, [dstq+strideq*4] + paddw m3, m4 + vpblendvb m3, m9, m3, m6 + punpckhwd m4, m0, m1 ; ab bb aa ba a9 b9 a8 b8 a3 b3 a2 b2 a1 b1 a0 b0 + punpcklwd m0, m1 ; af bf ae be ad bd ac bc a7 b7 a6 b6 a5 b5 a4 b4 + punpckhwd m1, m2, m3 ; cb db ca da c9 d9 c8 d8 c3 d3 c2 d2 c1 d1 c0 d0 + punpcklwd m2, m3 ; cf df ce de cd dd cc dc c7 d7 c6 d6 c5 d5 c4 d4 + punpckhdq m3, m4, m1 ; a9 b9 c9 d9 a8 b8 c8 d8 a1 b1 c1 d1 a0 b0 c0 d0 + vextracti128 xm6, m3, 1 + punpckldq m4, m1 ; ab bb cb db aa ba ca da a3 b3 c3 d3 a2 b2 c2 d2 + punpckhdq m1, m0, m2 ; ad bd cd dd ac bc cc dc a5 b5 c5 d5 a4 b4 c4 d4 + punpckldq m0, m2 ; af bf cf df ae be ce de a7 b7 c7 d7 a6 b6 c6 d6 + vextracti128 xm2, m4, 1 + movhps [dstq+strideq*0], xm6 + movq [dstq+strideq*1], xm6 + vextracti128 xm6, m1, 1 + movhps [dstq+strideq*2], xm2 + movq [dstq+r7 ], xm2 + vextracti128 xm2, m0, 1 + movhps [r5 +strideq*0], xm6 + movq [r5 +strideq*1], xm6 + movhps [r5 +strideq*2], xm2 + movq [r5 +r7 ], xm2 + lea r5, [dstq+strideq*8] + movhps [r5 +strideq*0], xm3 + movq [r5 +strideq*1], xm3 + movhps [r5 +strideq*2], xm4 + movq [r5 +r7 ], xm4 + lea r5, [r5+strideq*4] + movhps [r5 +strideq*0], xm1 + movq [r5 +strideq*1], xm1 + movhps [r5 +strideq*2], xm0 + movq [r5 +r7 ], xm0 + sub wd, 4 + jz .h16_end + add dstq, 8 + cmp r4d, maxbased + jg .h16_loop + mov hd, 4 +.h16_end_loop0: + mov r6d, wd + mov r2, dstq + test wb, 4 + jz .h16_end_loop + movq [dstq+strideq*0], xm9 + movq [dstq+strideq*1], xm9 + movq [dstq+strideq*2], xm9 + movq [dstq+r7 ], xm9 + and r6d, 120 + jz .h16_end_w4 + add dstq, 8 +.h16_end_loop: + mova [dstq+strideq*0], xm9 + mova [dstq+strideq*1], xm9 + mova [dstq+strideq*2], xm9 + mova [dstq+r7 ], xm9 + add dstq, 16 + sub r6d, 8 + jg .h16_end_loop +.h16_end_w4: + lea dstq, [r2+strideq*4] + dec hd + jg .h16_end_loop0 +.h16_end: + RET +.h32: + %assign stack_offset org_stack_offset + ALLOC_STACK -160, 9 + lea maxbased, [wq+31] + and maxbased, 31 + or maxbased, 32 
; imin(w+31, 63) + test angled, 0x400 + jnz .h32_main + vpbroadcastd m2, [pw_3] + movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i + punpckhwd m1, m0, m0 + vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i + paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m1, m2 + paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + lea r4, [rsp+128] + paddw m0, m1 + lea r5d, [maxbaseq-31] + psrlw m0, 2 + mova [r4], m0 +.h32_filter_loop: + mova m0, [tlq-62] + paddw m1, m2, [tlq-66] + paddw m0, [tlq-64] + pavgw m1, [tlq-58] + paddw m0, [tlq-60] + sub tlq, 32 + sub r4, 32 + paddw m0, m1 + psrlw m0, 2 + mova [r4], m0 + sub r5d, 16 + jg .h32_filter_loop + jl .h32_filter_h8 + mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + punpcklwd xm1, xm0, xm0 + paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d + vpblendd m1, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + movzx r5d, word [tlq-62] + movzx r2d, word [tlq-60] + pavgw m2, m3 + sub r2d, r5d + paddw m0, m1 + lea r2d, [r2+r5*8+4] + paddw m0, m2 + shr r2d, 3 + psrlw m0, 2 + mova [r4-32], m0 + mov [r4-36], r5w + mov [r4-34], r2w + lea tlq, [rsp+158] + mov r4d, 65 + cmp wd, 64 + cmove maxbased, r4d + jmp .h32_main +.h32_filter_h8: + mova xm0, [tlq-46] ; 0 1 2 3 4 5 6 7 + pblendw xm1, xm0, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6 + paddw xm2, [tlq-42] ; 2 3 4 5 6 7 8 9 + paddw xm0, [tlq-44] ; 1 2 3 4 5 6 7 8 + vpblendd xm3, xm1, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5 + lea tlq, [rsp+158] + pavgw xm2, xm3 + paddw xm0, xm1 + paddw xm0, xm2 + psrlw xm0, 2 + mova [r4-16], xm0 +.h32_main: + movd xm6, dyd + neg maxbaseq + vpbroadcastw m7, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m6, xm6 + lea r4d, [maxbaseq+dyq+15*64] + neg dyq + movd xm4, r4d + vpbroadcastd m8, [pw_m1024] + lea r4, [dyq+63] + vpbroadcastw m4, xm4 + or maxbased, 63 + psubw m4, [z_base_inc] +.h32_loop: + mov r5, r4 + sar r5, 6 + movu m1, [tlq+r5*2-64] + movu m0, [tlq+r5*2-62] + pand m3, m5, m4 + psllw m3, 9 + psubw m1, m0 + pmulhrsw m1, m3 + pcmpgtw m2, m8, m4 + paddw m0, m1 + vpblendvb m0, m7, m0, m2 + movu m2, [tlq+r5*2-32] + movu m1, [tlq+r5*2-30] + add r4, dyq + sub rsp, 64 + psubw m2, m1 + pmulhrsw m2, m3 + psraw m3, m4, 15 + paddw m4, m6 + mova [rsp+32*0], m0 + paddw m1, m2 + vpblendvb m1, m7, m1, m3 + mova [rsp+32*1], m1 + dec wd + jz .h32_transpose + cmp r4d, maxbased + jg .h32_loop +.h32_end_loop: + sub rsp, 64 + mova [rsp+32*0], m7 + mova [rsp+32*1], m7 + dec wd + jg .h32_end_loop +.h32_transpose: + lea r3, [strideq*3] + lea r4, [strideq*5] + mov r8, dstq + lea r5, [strideq+r3*2] +.h32_transpose_loop0: + lea r6, [rsp+32] + lea r2, [r8+org_wq*2-16] +.h32_transpose_loop: + mova m0, [r6+64*7] + mova m1, [r6+64*6] + mova m2, [r6+64*5] + mova m3, [r6+64*4] + mova m4, [r6+64*3] + mova m5, [r6+64*2] + mova m6, [r6+64*1] + mova m7, [r6+64*0] + punpckhwd m8, m0, m1 ; a3 b3 a2 b2 a1 b1 a0 b0 + punpcklwd m0, m1 ; a7 b7 a6 b6 a5 b5 a4 b4 + punpckhwd m1, m2, m3 ; c3 d3 c2 d2 c1 d1 c0 d0 + punpcklwd m2, m3 ; c7 d7 c6 d6 c5 d5 c4 d4 + punpckhwd m3, m4, m5 ; e3 f3 e2 f2 e1 f1 e0 f0 + punpcklwd m4, m5 ; e7 f7 e6 f6 e5 f5 e4 f4 + punpckhwd m5, m6, m7 ; g3 h3 g2 h2 g1 h1 g0 h0 + punpcklwd m6, m7 ; g7 h7 g6 h6 g5 h5 g4 h4 + lea dstq, [r2+strideq*8] + sub r6, 32 + punpckhdq m7, m8, m1 ; a1 b1 c1 d1 a0 b0 c0 d0 + punpckldq m8, m1 ; a3 b3 c3 d3 a2 b2 c2 d2 + punpckhdq m1, m3, m5 ; e1 f1 g1 h1 e0 f0 g0 h0 + 
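+; the qword interleaves below complete the 8x8 transpose; each result holds
+; two output rows (the "8 0" pair etc.), one per 128-bit lane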
punpckldq m3, m5 ; e3 f3 g3 h3 e2 f2 g2 h2 + punpckhqdq m5, m7, m1 ; 8 0 + vextracti128 [r2 +strideq*0], m5, 1 + punpcklqdq m7, m1 ; 9 1 + mova [dstq+strideq*0], xm5 + punpckhqdq m1, m8, m3 ; 10 2 + vextracti128 [r2 +strideq*1], m7, 1 + punpcklqdq m8, m3 ; 11 3 + mova [dstq+strideq*1], xm7 + punpckhdq m3, m0, m2 ; a5 b5 c5 d5 a4 b4 c4 d4 + vextracti128 [r2 +strideq*2], m1, 1 + punpckldq m0, m2 ; a7 b7 c7 d7 a6 b6 c6 d6 + mova [dstq+strideq*2], xm1 + punpckhdq m2, m4, m6 ; e5 f5 g5 h5 e4 f4 g4 h4 + vextracti128 [r2 +r3 ], m8, 1 + punpckldq m4, m6 ; e7 f7 g7 h7 e6 f6 g6 h6 + mova [dstq+r3 ], xm8 + punpckhqdq m6, m3, m2 ; 12 4 + vextracti128 [r2 +strideq*4], m6, 1 + punpcklqdq m3, m2 ; 13 5 + mova [dstq+strideq*4], xm6 + punpckhqdq m2, m0, m4 ; 14 6 + vextracti128 [r2 +r4 ], m3, 1 + punpcklqdq m0, m4 ; 15 7 + mova [dstq+r4 ], xm3 + vextracti128 [r2 +r3*2 ], m2, 1 + mova [dstq+r3*2 ], xm2 + vextracti128 [r2 +r5 ], m0, 1 + mova [dstq+r5 ], xm0 + lea r2, [dstq+strideq*8] + cmp r6, rsp + jae .h32_transpose_loop + add rsp, 64*8 + sub org_wd, 8 + jg .h32_transpose_loop0 +.h32_end: + RET +.h64: + %assign stack_offset org_stack_offset + ALLOC_STACK -256, 10 + lea maxbased, [wq+63] + test angled, 0x400 + jnz .h64_main + vpbroadcastd m2, [pw_3] + movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i + punpckhwd m1, m0, m0 + vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i + paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m1, m2 + paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + lea r4, [rsp+224] + paddw m0, m1 + lea r5d, [wq+32] + psrlw m0, 2 + mova [r4], m0 +.h64_filter_loop: + mova m0, [tlq-62] + paddw m1, m2, [tlq-66] + paddw m0, [tlq-64] + pavgw m1, [tlq-58] + paddw m0, [tlq-60] + sub tlq, 32 + sub r4, 32 + paddw m0, m1 + psrlw m0, 2 + mova [r4], m0 + sub r5d, 16 + jg .h64_filter_loop + mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f + punpcklwd xm1, xm0, xm0 + paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h + paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g + vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d + vpblendd m1, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e + lea tlq, [rsp+254] + pavgw m2, m3 + paddw m0, m1 + paddw m0, m2 + psrlw m0, 2 + mova [r4-32], m0 +.h64_main: + neg maxbaseq + movd xm4, dyd + vpbroadcastw m6, [tlq+maxbaseq*2] + shl maxbased, 6 + vpbroadcastw m4, xm4 + lea r4d, [maxbaseq+dyq+15*64] + neg dyq + vpbroadcastd m7, [pw_m1024] + movd xm3, r4d + lea r4, [dyq+63] + paddw m8, m7, m7 + vpbroadcastw m3, xm3 + or maxbased, 63 + paddw m9, m8, m7 + psubw m3, [z_base_inc] +.h64_loop: + mov r5, r4 + sar r5, 6 + movu m1, [tlq+r5*2-128] + movu m0, [tlq+r5*2-126] + pand m2, m5, m3 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + sub rsp, 128 + paddw m0, m1 + pcmpgtw m1, m9, m3 + vpblendvb m0, m6, m0, m1 + mova [rsp+32*0], m0 + movu m1, [tlq+r5*2-96] + movu m0, [tlq+r5*2-94] + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + pcmpgtw m1, m8, m3 + vpblendvb m0, m6, m0, m1 + mova [rsp+32*1], m0 + movu m1, [tlq+r5*2-64] + movu m0, [tlq+r5*2-62] + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + pcmpgtw m1, m7, m3 + vpblendvb m0, m6, m0, m1 + mova [rsp+32*2], m0 + movu m1, [tlq+r5*2-32] + movu m0, [tlq+r5*2-30] + psubw m1, m0 + pmulhrsw m1, m2 + add r4, dyq + psraw m2, m3, 15 + paddw m3, m4 + paddw m0, m1 + vpblendvb m0, m6, m0, m2 + mova [rsp+32*3], m0 + dec wd + jz .h64_transpose + cmp r4d, maxbased + jg .h64_loop +.h64_end_loop: + sub rsp, 128 + mova [rsp+32*0], m6 + mova 
[rsp+32*1], m6 + mova [rsp+32*2], m6 + mova [rsp+32*3], m6 + dec wd + jg .h64_end_loop +.h64_transpose: + lea r2, [strideq*3] + lea r3, [strideq*5] + mov r5, dstq + lea r4, [strideq+r2*2] +.h64_transpose_loop0: + lea r6, [rsp+112] + lea dstq, [r5+org_wq*2-32] +.h64_transpose_loop: + mova xm0, [r6+128*15] + vinserti128 m0, [r6+128* 7], 1 + mova xm1, [r6+128*14] + vinserti128 m1, [r6+128* 6], 1 + mova xm2, [r6+128*13] + vinserti128 m2, [r6+128* 5], 1 + mova xm3, [r6+128*12] + vinserti128 m3, [r6+128* 4], 1 + mova xm4, [r6+128*11] + vinserti128 m4, [r6+128* 3], 1 + mova xm5, [r6+128*10] + vinserti128 m5, [r6+128* 2], 1 + mova xm6, [r6+128* 9] + vinserti128 m6, [r6+128* 1], 1 + mova xm7, [r6+128* 8] + vinserti128 m7, [r6+128* 0], 1 + punpckhwd m8, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m6, m7 + punpcklwd m6, m7 + sub r6, 16 + punpckhdq m7, m8, m1 + punpckldq m8, m1 + punpckhdq m1, m3, m5 + punpckldq m3, m5 + punpckhqdq m5, m7, m1 + punpcklqdq m7, m1 + punpckhqdq m1, m8, m3 + punpcklqdq m8, m3 + punpckhdq m3, m0, m2 + mova [dstq+strideq*0], m5 + punpckldq m0, m2 + mova [dstq+strideq*1], m7 + punpckhdq m2, m4, m6 + mova [dstq+strideq*2], m1 + punpckldq m4, m6 + mova [dstq+r2 ], m8 + punpckhqdq m6, m3, m2 + mova [dstq+strideq*4], m6 + punpcklqdq m3, m2 + mova [dstq+r3 ], m3 + punpckhqdq m2, m0, m4 + mova [dstq+r2*2 ], m2 + punpcklqdq m0, m4 + mova [dstq+r4 ], m0 + lea dstq, [dstq+strideq*8] + cmp r6, rsp + jae .h64_transpose_loop + add rsp, 128*16 + sub org_wd, 16 + jg .h64_transpose_loop0 +.h64_end: + RET + +%macro FILTER_1BLK 5 ; dst, src, tmp, shuf, bdmax +%ifnum %4 + pshufb xm%2, xm%4 +%else + pshufb xm%2, %4 +%endif + vinserti128 m%2, xm%2, 1 + pshufd m%1, m%2, q0000 + pmaddwd m%1, m2 + pshufd m%3, m%2, q1111 + pmaddwd m%3, m3 + paddd m%1, m1 + paddd m%1, m%3 + pshufd m%3, m%2, q2222 + pmaddwd m%3, m4 + paddd m%1, m%3 + pshufd m%3, m%2, q3333 + pmaddwd m%3, m5 + paddd m%1, m%3 + psrad m%1, 4 + packusdw m%1, m%1 + pminsw m%1, m%5 +%endmacro + +%macro FILTER_2BLK 7 ; dst, src, tmp_dst, tmp_src, tmp, shuf, bdmax + pshufb m%2, m%6 + vpermq m%4, m%2, q3232 + vinserti128 m%2, xm%2, 1 + pshufd m%1, m%2, q0000 + pshufd m%3, m%4, q0000 + pmaddwd m%1, m2 + pmaddwd m%3, m2 + paddd m%1, m1 + paddd m%3, m1 + pshufd m%5, m%2, q1111 + pmaddwd m%5, m3 + paddd m%1, m%5 + pshufd m%5, m%4, q1111 + pmaddwd m%5, m3 + paddd m%3, m%5 + pshufd m%5, m%2, q2222 + pmaddwd m%5, m4 + paddd m%1, m%5 + pshufd m%5, m%4, q2222 + pmaddwd m%5, m4 + paddd m%3, m%5 + pshufd m%5, m%2, q3333 + pmaddwd m%5, m5 + paddd m%1, m%5 + pshufd m%5, m%4, q3333 + pmaddwd m%5, m5 + paddd m%3, m%5 + psrad m%1, 4 + psrad m%3, 4 + packusdw m%1, m%3 + pminsw m%1, m%7 +%endmacro + +; The ipred_filter SIMD processes 4x2 blocks in the following order which +; increases parallelism compared to doing things row by row. One redundant +; block is calculated for w8 and w16, two for w32. 
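+; Blocks that share a number below only depend on blocks from earlier steps,
+; never on each other, which is what allows them to be batched together: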
+; w4 w8 w16 w32 +; 1 1 2 1 2 3 5 1 2 3 5 b c d f +; 2 2 3 2 4 5 7 2 4 5 7 c e f h +; 3 3 4 4 6 7 9 4 6 7 9 e g h j +; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___ +; 5 8 8 i + +cglobal ipred_filter_16bpc, 3, 9, 0, dst, stride, tl, w, h, filter +%assign org_stack_offset stack_offset +%define base r6-ipred_filter_16bpc_avx2_table + lea r6, [filter_intra_taps] + tzcnt wd, wm +%ifidn filterd, filterm + movzx filterd, filterb +%else + movzx filterd, byte filterm +%endif + shl filterd, 6 + add filterq, r6 + lea r6, [ipred_filter_16bpc_avx2_table] + vbroadcasti128 m0, [tlq-6] + movsxd wq, [r6+wq*4] + vpbroadcastd m1, [base+pd_8] + pmovsxbw m2, [filterq+16*0] + pmovsxbw m3, [filterq+16*1] + pmovsxbw m4, [filterq+16*2] + pmovsxbw m5, [filterq+16*3] + add wq, r6 + mov hd, hm + jmp wq +.w4: + WIN64_SPILL_XMM 10 + mova xm8, [base+filter_shuf2] + vpbroadcastw m9, r8m ; bitdepth_max + lea r7, [6+hq*2] + sub tlq, r7 + jmp .w4_loop_start +.w4_loop: + pinsrq xm0, [tlq+hq*2], 0 + lea dstq, [dstq+strideq*2] +.w4_loop_start: + FILTER_1BLK 6, 0, 7, 8, 9 + vextracti128 xm0, m6, 1 + movq [dstq+strideq*0], xm6 + movq [dstq+strideq*1], xm0 + sub hd, 2 + jg .w4_loop + RET +ALIGN function_align +.w8: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 16 + vbroadcasti128 m14, [base+filter_shuf3] + vpbroadcastw m15, r8m ; bitdepth_max + FILTER_1BLK 10, 0, 7, [base+filter_shuf2], 15 + vpermq m6, m10, q1302 ; ____ ____ | ____ 4321 + pslldq m8, m0, 4 + psrldq m7, m6, 2 + psrldq m0, m6, 10 + punpcklwd m7, m0 + vpblendd m8, m6, 0x33 ; _0__ 4321 | ____ 4321 + vpblendd m8, m7, 0x40 ; _056 4321 | ____ 4321 + vpblendd m8, [tlq-6], 0x30 ; _056 4321 | ____ 4321 + lea r7, [16+hq*2] + sub tlq, r7 + jmp .w8_loop_start +.w8_loop: + vpermq m8, m9, q1302 ; ____ 4321 | ____ 4321 + vpermq m6, m9, q2031 + psrldq m0, m6, 2 + psrldq m6, 10 + punpcklwd m6, m0 + vpblendd m8, m7, 0x80 ; _0__ 4321 | ____ 4321 + vpblendd m8, m6, 0x40 ; _056 4321 | ____ 4321 + mova m10, m9 +.w8_loop_start: + vpblendd m8, [tlq+hq*2], 0x0C ; _056 4321 | _056 4321 + call .main + vpblendd m10, m9, 0xCC + mova [dstq+strideq*0], xm10 + vextracti128 [dstq+strideq*1], m10, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET +ALIGN function_align +.w16: + %assign stack_offset stack_offset - stack_size_padded + ALLOC_STACK 32, 16 + vpbroadcastw m15, r8m ; bitdepth_max + sub hd, 2 + TAIL_CALL .w16_main, 0 +.w16_main: + mova xm10, [base+filter_shuf2] + FILTER_1BLK 13, 0, 6, 10, 15 + vpermq m12, m13, q3120 + mova xm14, [base+filter_shuf3] + vinserti128 m14, [base+filter_shuf1], 1 + vpbroadcastq m0, [tlq+10] + vpblendd m0, [tlq-16], 0x4C ; ___0 4321 | _056 ____ + psrldq m6, m12, 8 + vpblendd m0, m6, 0x03 ; ___0 4321 | _056 4321 + punpcklwd m6, m12 + vpblendd m0, m6, 0x80 ; 56_0 4321 | _056 4321 + FILTER_2BLK 12, 0, 6, 7, 8, 14, 15 + vpblendd m13, m12, 0xCC + vpermq m12, m12, q2031 ; 6___ 5___ + psrldq xm6, xm12, 2 + psrldq xm8, xm12, 12 + vpblendd xm6, xm8, 0x01 + pblendw xm6, [tlq+10], 0xF8 ; 4321 056_ + FILTER_1BLK 11, 6, 8, 10, 15 + vpermq m11, m11, q3120 + pshufd m9, m11, q1032 + movu m8, [tlq+6] ; __43 210_ | ____ ____ + pshufd m8, m8, q3021 ; __0_ 4321 | ____ ____ + pshufhw m8, m8, q3201 ; ___0 4321 | ____ ____ + vpblendd m9, m8, 0x70 ; ___0 4321 | ____ 4321 + mova [dstq+strideq*0], xm13 + vextracti128 [dstq+strideq*1], m13, 1 + lea r7, [20+hq*2] + sub tlq, r7 + vpermq m6, m12, q0123 ; ____ 4321 | ____ 4321 + jmp .w16_loop_start +.w16_loop: + vpermq m13, m13, q3322 + vpermq m11, m9, q2020 + vpermq m9, m9, q1302 + vpermq m6, m12, 
q0123 + psrldq m7, 4 + vpblendd m13, m10, 0xCC + vpblendd m9, m7, 0x40 + mova m0, [rsp+8] + mova [dstq+strideq*0], xm13 + vextracti128 [dstq+strideq*1], m13, 1 +.w16_loop_start: + mova m13, m12 + vpblendd m0, [tlq+hq*2], 0x0C + psrldq m7, m12, 8 + punpcklwd m7, m12 + vpblendd m0, m6, 0x33 ; ___0 4321 | _056 4321 + vpblendd m0, m7, 0x80 ; 56_0 4321 | _056 4321 + FILTER_2BLK 10, 0, 6, 7, 8, 14, 15 + vpermq m12, m10, q2031 + mova [rsp+8], m0 + psrldq m8, m11, 8 + psrldq xm6, xm12, 2 + psrldq xm7, xm12, 10 + psrldq xm0, xm13, 2 + punpcklwd m8, m11 + punpcklwd xm7, xm6 + vpblendd m8, m9, 0x73 ; 56_0 4321 | ____ 4321 + vpblendd m8, m7, 0x04 ; 56_0 4321 | __56 4321 + vpblendd m8, m0, 0x08 ; 56_0 4321 | _056 4321 + call .main + vpermq m8, m11, q3120 + vpblendd m6, m8, m9, 0xCC + mova [dstq+strideq*0+16], xm6 + vextracti128 [dstq+strideq*1+16], m6, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_loop + vpermq m8, m9, q3120 + vextracti128 xm0, m8, 1 ; 4321 ____ + pshufd xm11, xm11, q1032 + vpblendd xm0, xm11, 0x02 ; 4321 0___ + psrldq xm6, xm8, 2 + psrldq xm7, xm8, 12 + pblendw xm0, xm6, 0x4 ; 4321 05__ + pblendw xm0, xm7, 0x2 ; 4321 056_ + FILTER_1BLK 6, 0, 7, [base+filter_shuf2], 15 + vpermq m12, m13, q1302 + vpblendd m12, m10, 0xCC + vpblendd m9, m6, 0xCC + mova [dstq+strideq*0+ 0], xm12 + mova [dstq+strideq*0+16], xm9 + vextracti128 [dstq+strideq*1+ 0], m12, 1 + vextracti128 [dstq+strideq*1+16], m9, 1 + ret +ALIGN function_align +.w32: + %assign stack_offset org_stack_offset + ALLOC_STACK 64, 16 + vpbroadcastw m15, r8m ; bitdepth_max + sub hd, 2 + lea r3, [dstq+32] + lea r5d, [hd*2+20] + call .w16_main + mov dstq, r3 + lea tlq, [tlq+r5+32] + sub r5d, 20 + shr r5d, 1 + sub r5d, 2 + lea r4, [dstq+strideq*2-2] +DEFINE_ARGS dst, stride, tl, stride3, left, h + lea stride3q, [strideq*3] + movu m8, [tlq-6] ; 4321 0___ + mova xm10, [base+filter_shuf2] + pinsrw xm0, xm8, [dstq+strideq*0-2], 2 + pinsrw xm0, xm0, [dstq+strideq*1-2], 1 ; 4321 056_ + pinsrw xm9, [leftq+strideq*0], 5 + pinsrw xm9, [leftq+strideq*1], 4 + FILTER_1BLK 13, 0, 6, 10, 15 + vpermq m12, m13, q3120 + mova xm14, [base+filter_shuf3] + vinserti128 m14, [base+filter_shuf1], 1 + psrldq m6, m12, 8 + punpcklwd m7, m6, m12 + vpblendd m0, m6, 0x03 ; ___0 ____ | _0__ 4321 + vpblendd m0, m7, 0x80 ; 56_0 ____ | _0__ 4321 + vpblendd m0, m8, 0x30 ; 56_0 4321 | _0__ 4321 + vpblendd m0, m9, 0x04 ; 56_0 4321 | _056 4321 + FILTER_2BLK 12, 0, 6, 7, 8, 14, 15 + vpblendd m13, m12, 0xCC + pinsrw xm9, [leftq+strideq*2], 3 + pinsrw xm9, [leftq+stride3q ], 2 + lea leftq, [leftq+strideq*4] + pinsrw xm9, [leftq+strideq*0], 1 + pinsrw xm9, [leftq+strideq*1], 0 + movq [rsp+32], xm9 + mov r7d, 1 + pslldq m8, m9, 4 + vpblendd m0, m8, 0x0C ; ___0 ____ | _056 ____ + vpermq m12, m12, q2031 ; 6___ 5___ + psrldq xm6, xm12, 2 + psrldq xm7, xm12, 12 + vpblendd xm6, xm7, 0x01 ; ____ _56_ + pblendw xm6, [tlq+10], 0xF8 ; 4321 056_ + FILTER_1BLK 11, 6, 7, 10, 15 + vpermq m11, m11, q3120 + pshufd m9, m11, q1032 + vbroadcasti128 m8, [tlq+22] ; __43 210_ | ____ ____ + pshufd m8, m8, q3021 ; __0_ 4321 | ____ ____ + pshufhw m8, m8, q3201 ; ___0 4321 | ____ ____ + vpblendd m9, m8, 0x70 ; ___0 4321 | ____ 4321 + mova [dstq+strideq*0], xm13 + vextracti128 [dstq+strideq*1], m13, 1 + vpermq m6, m12, q0123 ; ____ 4321 | ____ 4321 + jmp .w32_loop_start +.w32_loop_last: + mova m0, [rsp+0] + jmp .w32_loop +.w32_loop_left: + mova m0, [rsp+0] + vpblendd m0, [rsp+32+r7*4-12], 0x0C + dec r7d + jg .w32_loop + cmp hd, 2 + je .w32_loop + pinsrw xm6, [rsp+32], 6 + pinsrw xm6, 
[leftq+strideq*2], 5 + pinsrw xm6, [leftq+stride3q ], 4 + lea leftq, [leftq+strideq*4] + pinsrw xm6, [leftq+strideq*0], 3 + pinsrw xm6, [leftq+strideq*1], 2 + pinsrw xm6, [leftq+strideq*2], 1 + pinsrw xm6, [leftq+stride3q ], 0 + lea leftq, [leftq+strideq*4] + movu [rsp+36], xm6 + pinsrw xm6, [leftq+strideq*0], 1 + pinsrw xm6, [leftq+strideq*1], 0 + movd [rsp+32], xm6 + mov r7d, 4 +.w32_loop: + vpermq m13, m13, q3322 + vpermq m11, m9, q2020 + vpermq m9, m9, q1302 + vpermq m6, m12, q0123 + psrldq m7, 4 + vpblendd m13, m10, 0xCC + vpblendd m9, m7, 0x40 ; ___0 4321 | ____ 4321 + mova [dstq+strideq*0], xm13 + vextracti128 [dstq+strideq*1], m13, 1 +.w32_loop_start: + mova m13, m12 + psrldq m7, m12, 8 + punpcklwd m7, m12 + vpblendd m0, m6, 0x33 ; ___0 4321 | _056 4321 + vpblendd m0, m7, 0x80 ; 56_0 4321 | _056 4321 + FILTER_2BLK 10, 0, 6, 7, 8, 14, 15 + vpermq m12, m10, q2031 + mova [rsp+0], m0 + psrldq m8, m11, 8 + psrldq xm6, xm12, 2 + psrldq xm7, xm12, 10 + psrldq xm0, xm13, 2 + punpcklwd m8, m11 + punpcklwd xm7, xm6 + vpblendd m8, m9, 0x73 ; 56_0 4321 | ____ 4321 + vpblendd m8, m7, 0x04 ; 56_0 4321 | __56 4321 + vpblendd m8, m0, 0x08 ; 56_0 4321 | _056 4321 + call .main + vpermq m8, m11, q3120 + vpblendd m6, m8, m9, 0xCC + mova [dstq+strideq*0+16], xm6 + vextracti128 [dstq+strideq*1+16], m6, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop_left + jz .w32_loop_last + vpermq m8, m9, q3120 + vextracti128 xm0, m8, 1 ; 4321 ____ + pshufd xm11, xm11, q1032 + vpblendd xm0, xm11, 0x02 ; 4321 0___ + psrldq xm6, xm8, 2 + psrldq xm7, xm8, 12 + pblendw xm0, xm6, 0x4 ; 4321 05__ + pblendw xm0, xm7, 0x2 ; 4321 056_ + FILTER_1BLK 6, 0, 7, [base+filter_shuf2], 15 + vpermq m12, m13, q1302 + vpblendd m12, m10, 0xCC + vpblendd m9, m6, 0xCC + mova [dstq+strideq*0+ 0], xm12 + mova [dstq+strideq*0+16], xm9 + vextracti128 [dstq+strideq*1+ 0], m12, 1 + vextracti128 [dstq+strideq*1+16], m9, 1 + RET +.main: + FILTER_2BLK 9, 8, 6, 7, 0, 14, 15 + ret + +%if WIN64 +DECLARE_REG_TMP 5 +%else +DECLARE_REG_TMP 7 +%endif + +%macro IPRED_CFL 1 ; ac in, unpacked pixels out + psignw m3, m%1, m1 + pabsw m%1, m%1 + pmulhrsw m%1, m2 + psignw m%1, m3 + paddw m%1, m0 +%endmacro + +cglobal ipred_cfl_top_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha + movifnidn hd, hm + add tlq, 2 + movd xm4, wd + pxor m6, m6 + vpbroadcastw m7, r7m + pavgw xm4, xm6 + tzcnt wd, wd + movd xm5, wd + movu m0, [tlq] + lea t0, [ipred_cfl_left_16bpc_avx2_table] + movsxd r6, [t0+wq*4] + add r6, t0 + add t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table + movsxd wq, [t0+wq*4] + add wq, t0 + movifnidn acq, acmp + jmp r6 + +cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha + mov hd, hm ; zero upper half + sub tlq, hq + movd xm4, hd + sub tlq, hq + pxor m6, m6 + vpbroadcastw m7, r7m + pavgw xm4, xm6 + tzcnt r6d, hd + movd xm5, r6d + movu m0, [tlq] + lea t0, [ipred_cfl_left_16bpc_avx2_table] + movsxd r6, [t0+r6*4] + add r6, t0 + add t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table + tzcnt wd, wd + movsxd wq, [t0+wq*4] + add wq, t0 + movifnidn acq, acmp + jmp r6 +.h32: + paddw m0, [tlq+32] +.h16: + vextracti128 xm1, m0, 1 + paddw xm0, xm1 +.h8: + psrldq xm1, xm0, 8 + paddw xm0, xm1 +.h4: + punpcklwd xm0, xm6 + psrlq xm1, xm0, 32 + paddd xm0, xm1 + psrldq xm1, xm0, 8 + paddd xm0, xm1 + paddd xm0, xm4 + psrld xm0, xm5 + vpbroadcastw m0, xm0 + jmp wq + +cglobal ipred_cfl_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha + movifnidn hd, hm + movifnidn wd, wm + tzcnt r6d, hd + lea t0d, [wq+hq] + movd 
xm4, t0d + tzcnt t0d, t0d + movd xm5, t0d + lea t0, [ipred_cfl_16bpc_avx2_table] + tzcnt wd, wd + movsxd r6, [t0+r6*4] + movsxd wq, [t0+wq*4+4*4] + psrlw xm4, 1 + pxor m6, m6 + vpbroadcastw m7, r7m + add r6, t0 + add wq, t0 + movifnidn acq, acmp + jmp r6 +.h4: + movq xm0, [tlq-8] + jmp wq +.w4: + movq xm1, [tlq+2] + paddw m0, m4 + paddw m0, m1 + psrlq m1, m0, 32 + paddw m0, m1 + psrld m1, m0, 16 + paddw m0, m1 + cmp hd, 4 + jg .w4_mul + psrlw xm0, 3 + jmp .w4_end +.w4_mul: + vextracti128 xm1, m0, 1 + paddw xm0, xm1 + lea r2d, [hq*2] + mov r6d, 0xAAAB6667 + shrx r6d, r6d, r2d + punpckhwd xm1, xm0, xm6 + punpcklwd xm0, xm6 + paddd xm0, xm1 + movd xm1, r6d + psrld xm0, 2 + pmulhuw xm0, xm1 + psrlw xm0, 1 +.w4_end: + vpbroadcastw m0, xm0 +.s4: + vpbroadcastw m1, alpham + lea r6, [strideq*3] + pabsw m2, m1 + psllw m2, 9 +.s4_loop: + mova m4, [acq] + IPRED_CFL 4 + pmaxsw m4, m6 + pminsw m4, m7 + vextracti128 xm5, m4, 1 + movq [dstq+strideq*0], xm4 + movq [dstq+strideq*2], xm5 + movhps [dstq+strideq*1], xm4 + movhps [dstq+r6 ], xm5 + lea dstq, [dstq+strideq*4] + add acq, 32 + sub hd, 4 + jg .s4_loop + RET +ALIGN function_align +.h8: + mova xm0, [tlq-16] + jmp wq +.w8: + vextracti128 xm1, m0, 1 + paddw xm0, [tlq+2] + paddw xm0, xm4 + paddw xm0, xm1 + psrld xm1, xm0, 16 + paddw xm0, xm1 + pblendw xm0, xm6, 0xAA + psrlq xm1, xm0, 32 + paddd xm0, xm1 + psrldq xm1, xm0, 8 + paddd xm0, xm1 + psrld xm0, xm5 + cmp hd, 8 + je .w8_end + mov r6d, 0xAAAB + mov r2d, 0x6667 + cmp hd, 32 + cmovz r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 + psrlw xm0, 1 +.w8_end: + vpbroadcastw m0, xm0 +.s8: + vpbroadcastw m1, alpham + lea r6, [strideq*3] + pabsw m2, m1 + psllw m2, 9 +.s8_loop: + mova m4, [acq] + mova m5, [acq+32] + IPRED_CFL 4 + IPRED_CFL 5 + pmaxsw m4, m6 + pmaxsw m5, m6 + pminsw m4, m7 + pminsw m5, m7 + mova [dstq+strideq*0], xm4 + mova [dstq+strideq*2], xm5 + vextracti128 [dstq+strideq*1], m4, 1 + vextracti128 [dstq+r6 ], m5, 1 + lea dstq, [dstq+strideq*4] + add acq, 64 + sub hd, 4 + jg .s8_loop + RET +ALIGN function_align +.h16: + mova m0, [tlq-32] + jmp wq +.w16: + paddw m0, [tlq+2] + vextracti128 xm1, m0, 1 + paddw xm0, xm4 + paddw xm0, xm1 + punpckhwd xm1, xm0, xm6 + punpcklwd xm0, xm6 + paddd xm0, xm1 + psrlq xm1, xm0, 32 + paddd xm0, xm1 + psrldq xm1, xm0, 8 + paddd xm0, xm1 + psrld xm0, xm5 + cmp hd, 16 + je .w16_end + mov r6d, 0xAAAB + mov r2d, 0x6667 + test hb, 8|32 + cmovz r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 + psrlw xm0, 1 +.w16_end: + vpbroadcastw m0, xm0 +.s16: + vpbroadcastw m1, alpham + pabsw m2, m1 + psllw m2, 9 +.s16_loop: + mova m4, [acq] + mova m5, [acq+32] + IPRED_CFL 4 + IPRED_CFL 5 + pmaxsw m4, m6 + pmaxsw m5, m6 + pminsw m4, m7 + pminsw m5, m7 + mova [dstq+strideq*0], m4 + mova [dstq+strideq*1], m5 + lea dstq, [dstq+strideq*2] + add acq, 64 + sub hd, 2 + jg .s16_loop + RET +ALIGN function_align +.h32: + mova m0, [tlq-64] + paddw m0, [tlq-32] + jmp wq +.w32: + paddw m0, [tlq+ 2] + paddw m0, [tlq+34] + vextracti128 xm1, m0, 1 + paddw xm0, xm4 + paddw xm0, xm1 + punpcklwd xm1, xm0, xm6 + punpckhwd xm0, xm6 + paddd xm0, xm1 + psrlq xm1, xm0, 32 + paddd xm0, xm1 + psrldq xm1, xm0, 8 + paddd xm0, xm1 + psrld xm0, xm5 + cmp hd, 32 + je .w32_end + lea r2d, [hq*2] + mov r6d, 0x6667AAAB + shrx r6d, r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 + psrlw xm0, 1 +.w32_end: + vpbroadcastw m0, xm0 +.s32: + vpbroadcastw m1, alpham + pabsw m2, m1 + psllw m2, 9 +.s32_loop: + mova m4, [acq] + mova m5, [acq+32] + IPRED_CFL 4 + IPRED_CFL 5 + pmaxsw m4, m6 + pmaxsw m5, m6 + pminsw m4, m7 + pminsw m5, 
m7 + mova [dstq+32*0], m4 + mova [dstq+32*1], m5 + add dstq, strideq + add acq, 64 + dec hd + jg .s32_loop + RET + +cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha + mov r6d, r7m + shr r6d, 11 + lea t0, [ipred_cfl_splat_16bpc_avx2_table] + tzcnt wd, wd + movifnidn hd, hm + movsxd wq, [t0+wq*4] + vpbroadcastd m0, [t0-ipred_cfl_splat_16bpc_avx2_table+pw_512+r6*4] + pxor m6, m6 + vpbroadcastw m7, r7m + add wq, t0 + movifnidn acq, acmp + jmp wq + +cglobal ipred_cfl_ac_420_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h + movifnidn hpadd, hpadm + vpbroadcastd m5, [pw_2] + mov hd, hm + shl hpadd, 2 + pxor m4, m4 + sub hd, hpadd + cmp dword wm, 8 + jg .w16 + je .w8 +.w4: + lea r3, [strideq*3] + mov r5, acq +.w4_loop: + mova xm0, [ypxq+strideq*2] + mova xm1, [ypxq+r3 ] + vinserti128 m0, [ypxq+strideq*0], 1 + vinserti128 m1, [ypxq+strideq*1], 1 + lea ypxq, [ypxq+strideq*4] + pmaddwd m0, m5 + pmaddwd m1, m5 + paddd m0, m1 + vextracti128 xm1, m0, 1 + paddd m4, m0 + packssdw xm1, xm0 + mova [acq], xm1 + add acq, 16 + sub hd, 2 + jg .w4_loop + test hpadd, hpadd + jz .dc + vpermq m1, m1, q1111 + pslld xm0, 2 +.w4_hpad_loop: + mova [acq], m1 + paddd m4, m0 + add acq, 32 + sub hpadd, 4 + jg .w4_hpad_loop + jmp .dc +.w8: + mov r5, acq + test wpadd, wpadd + jnz .w8_wpad1 +.w8_loop: + pmaddwd m0, m5, [ypxq+strideq*0] + pmaddwd m1, m5, [ypxq+strideq*1] + lea ypxq, [ypxq+strideq*2] + paddd m0, m1 + vextracti128 xm1, m0, 1 + paddd m4, m0 + packssdw xm1, xm0, xm1 + mova [acq], xm1 + add acq, 16 + dec hd + jg .w8_loop +.w8_hpad: + test hpadd, hpadd + jz .dc + vinserti128 m1, xm1, 1 + pslld m0, 2 + jmp .hpad +.w8_wpad1: + pmaddwd xm0, xm5, [ypxq+strideq*0] + pmaddwd xm3, xm5, [ypxq+strideq*1] + lea ypxq, [ypxq+strideq*2] + paddd xm0, xm3 + pshufd xm3, xm0, q3333 + packssdw xm1, xm0, xm3 + paddd xm0, xm3 + paddd xm4, xm0 + mova [acq], xm1 + add acq, 16 + dec hd + jg .w8_wpad1 + jmp .w8_hpad +.w16_wpad: + mova m0, [ypxq+strideq*0+ 0] + mova m1, [ypxq+strideq*1+ 0] + cmp wpadd, 2 + jl .w16_wpad1 + je .w16_wpad2 + vpbroadcastd m2, [ypxq+strideq*0+12] + vpbroadcastd m3, [ypxq+strideq*1+12] + vpblendd m0, m2, 0xf0 + vpblendd m1, m3, 0xf0 + jmp .w16_wpad_end +.w16_wpad2: + vpbroadcastd m2, [ypxq+strideq*0+28] + vpbroadcastd m3, [ypxq+strideq*1+28] + jmp .w16_wpad_end +.w16_wpad1: + vpbroadcastd m2, [ypxq+strideq*0+44] + vpbroadcastd m3, [ypxq+strideq*1+44] + vinserti128 m2, [ypxq+strideq*0+32], 0 + vinserti128 m3, [ypxq+strideq*1+32], 0 +.w16_wpad_end: + lea ypxq, [ypxq+strideq*2] + REPX {pmaddwd x, m5}, m0, m1, m2, m3 + paddd m0, m1 + paddd m2, m3 + packssdw m1, m0, m2 + paddd m0, m2 + vpermq m1, m1, q3120 + paddd m4, m0 + mova [acq], m1 + add acq, 32 + dec hd + jg .w16_wpad + jmp .w16_hpad +.w16: + mov r5, acq + test wpadd, wpadd + jnz .w16_wpad +.w16_loop: + pmaddwd m0, m5, [ypxq+strideq*0+ 0] + pmaddwd m2, m5, [ypxq+strideq*0+32] + pmaddwd m1, m5, [ypxq+strideq*1+ 0] + pmaddwd m3, m5, [ypxq+strideq*1+32] + lea ypxq, [ypxq+strideq*2] + paddd m0, m1 + paddd m2, m3 + packssdw m1, m0, m2 + paddd m0, m2 + vpermq m1, m1, q3120 + paddd m4, m0 + mova [acq], m1 + add acq, 32 + dec hd + jg .w16_loop +.w16_hpad: + add hpadd, hpadd + jz .dc + paddd m0, m0 +.hpad: + mova [acq+32*0], m1 + paddd m4, m0 + mova [acq+32*1], m1 + add acq, 32*2 + sub hpadd, 4 + jg .hpad +.dc: + vextracti128 xm1, m4, 1 + sub r5, acq ; -w*h*2 + tzcnt r1d, r5d + paddd xm4, xm1 + sub r1d, 2 + punpckhqdq xm1, xm4, xm4 + movd xm0, r1d + paddd xm1, xm4 + pshuflw xm4, xm1, q1032 + paddd xm1, xm4 + psrld xm1, xm0 + pxor xm0, xm0 + pavgw 
xm1, xm0 + vpbroadcastw m1, xm1 +.dc_loop: + mova m0, [acq+r5] + psubw m0, m1 + mova [acq+r5], m0 + add r5, 32 + jl .dc_loop + RET + +cglobal ipred_cfl_ac_422_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h + movifnidn hpadd, hpadm + vpbroadcastd m5, [pw_4] + mov hd, hm + shl hpadd, 2 + pxor m4, m4 + sub hd, hpadd + cmp dword wm, 8 + jg .w16 + je .w8 +.w4: + lea r3, [strideq*3] + mov r5, acq +.w4_loop: + mova xm0, [ypxq+strideq*0] + mova xm1, [ypxq+strideq*1] + vinserti128 m0, [ypxq+strideq*2], 1 + vinserti128 m1, [ypxq+r3 ], 1 + lea ypxq, [ypxq+strideq*4] + pmaddwd m0, m5 + pmaddwd m1, m5 + paddd m4, m0 + packssdw m0, m1 + paddd m4, m1 + mova [acq], m0 + add acq, 32 + sub hd, 4 + jg .w4_loop + test hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc + vextracti128 xm1, m1, 1 + vpermq m0, m0, q3333 + pslld xm1, 2 +.w4_hpad_loop: + mova [acq], m0 + paddd m4, m1 + add acq, 32 + sub hpadd, 4 + jg .w4_hpad_loop + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc +.w8: + mov r5, acq + test wpadd, wpadd + jnz .w8_wpad1 +.w8_loop: + pmaddwd m1, m5, [ypxq+strideq*0] + pmaddwd m0, m5, [ypxq+strideq*1] + lea ypxq, [ypxq+strideq*2] + paddd m4, m1 + packssdw m1, m0 + paddd m4, m0 + vpermq m2, m1, q3120 + mova [acq], m2 + add acq, 32 + sub hd, 2 + jg .w8_loop +.w8_hpad: + test hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc + vpermq m1, m1, q3131 + pslld m0, 2 + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad +.w8_wpad1: + vpbroadcastd m1, [ypxq+strideq*0+12] + vpbroadcastd m0, [ypxq+strideq*1+12] + vinserti128 m1, [ypxq+strideq*0+ 0], 0 + vinserti128 m0, [ypxq+strideq*1+ 0], 0 + lea ypxq, [ypxq+strideq*2] + pmaddwd m1, m5 + pmaddwd m0, m5 + paddd m4, m1 + packssdw m1, m0 + paddd m4, m0 + vpermq m2, m1, q3120 + mova [acq], m2 + add acq, 32 + sub hd, 2 + jg .w8_wpad1 + jmp .w8_hpad +.w16: + mov r5, acq + test wpadd, wpadd + jnz .w16_wpad +.w16_loop: + pmaddwd m2, m5, [ypxq+strideq*0+ 0] + pmaddwd m1, m5, [ypxq+strideq*0+32] + pmaddwd m0, m5, [ypxq+strideq*1+ 0] + pmaddwd m3, m5, [ypxq+strideq*1+32] + lea ypxq, [ypxq+strideq*2] + paddd m4, m2 + packssdw m2, m1 + paddd m4, m1 + packssdw m1, m0, m3 + paddd m0, m3 + vpermq m2, m2, q3120 + paddd m4, m0 + vpermq m1, m1, q3120 + mova [acq+32*0], m2 + mova [acq+32*1], m1 + add acq, 32*2 + sub hd, 2 + jg .w16_loop + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).w16_hpad +.w16_wpad: + mova m2, [ypxq+strideq*0+ 0] + mova m0, [ypxq+strideq*1+ 0] + cmp wpadd, 2 + jl .w16_wpad1 + je .w16_wpad2 + vpbroadcastd m1, [ypxq+strideq*0+12] + vpbroadcastd m3, [ypxq+strideq*1+12] + vpblendd m2, m1, 0xf0 + vpblendd m0, m3, 0xf0 + jmp .w16_wpad_end +.w16_wpad2: + vpbroadcastd m1, [ypxq+strideq*0+28] + vpbroadcastd m3, [ypxq+strideq*1+28] + jmp .w16_wpad_end +.w16_wpad1: + vpbroadcastd m1, [ypxq+strideq*0+44] + vpbroadcastd m3, [ypxq+strideq*1+44] + vinserti128 m1, [ypxq+strideq*0+32], 0 + vinserti128 m3, [ypxq+strideq*1+32], 0 +.w16_wpad_end: + lea ypxq, [ypxq+strideq*2] + REPX {pmaddwd x, m5}, m2, m0, m1, m3 + paddd m4, m2 + packssdw m2, m1 + paddd m4, m1 + packssdw m1, m0, m3 + paddd m0, m3 + vpermq m2, m2, q3120 + paddd m4, m0 + vpermq m1, m1, q3120 + mova [acq+32*0], m2 + mova [acq+32*1], m1 + add acq, 32*2 + sub hd, 2 + jg .w16_wpad + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).w16_hpad + +cglobal ipred_cfl_ac_444_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h + lea r6, [ipred_cfl_ac_444_16bpc_avx2_table] + tzcnt wd, wm + movifnidn hpadd, hpadm + vpbroadcastd m5, 
[pw_1] + movsxd wq, [r6+wq*4] + shl hpadd, 2 + add wq, r6 + mov hd, hm + pxor m4, m4 + sub hd, hpadd + jmp wq +.w4: + lea r3, [strideq*3] + mov r5, acq +.w4_loop: + movq xm0, [ypxq+strideq*0] + movhps xm0, [ypxq+strideq*1] + vpbroadcastq m1, [ypxq+strideq*2] + vpbroadcastq m2, [ypxq+r3 ] + lea ypxq, [ypxq+strideq*4] + vpblendd m0, m1, 0x30 + vpblendd m0, m2, 0xc0 + psllw m0, 3 + pmaddwd m1, m0, m5 + mova [acq], m0 + add acq, 32 + paddd m4, m1 + sub hd, 4 + jg .w4_loop + test hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc + vpermq m0, m0, q3333 + paddd m1, m1 + mova [acq+32*0], m0 + vpermq m1, m1, q3333 + mova [acq+32*1], m0 + add acq, 32*2 + paddd m4, m1 + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc +.w8: + lea r3, [strideq*3] + mov r5, acq +.w8_loop: + mova xm2, [ypxq+strideq*0] + vinserti128 m2, [ypxq+strideq*1], 1 + mova xm1, [ypxq+strideq*2] + vinserti128 m1, [ypxq+r3 ], 1 + lea ypxq, [ypxq+strideq*4] + psllw m2, 3 + psllw m1, 3 + mova [acq+32*0], m2 + pmaddwd m2, m5 + mova [acq+32*1], m1 + pmaddwd m0, m1, m5 + add acq, 32*2 + paddd m4, m2 + paddd m4, m0 + sub hd, 4 + jg .w8_loop + test hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc + vperm2i128 m1, m1, 0x11 + pslld m0, 2 + pxor m2, m2 + vpblendd m0, m2, 0x0f + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad +.w16_wpad2: + vpbroadcastw m3, [ypxq+strideq*0+14] + vpbroadcastw m0, [ypxq+strideq*1+14] + vpblendd m2, m3, 0xf0 + vpblendd m1, m0, 0xf0 + jmp .w16_wpad_end +.w16: + mov r5, acq +.w16_loop: + mova m2, [ypxq+strideq*0] + mova m1, [ypxq+strideq*1] + test wpadd, wpadd + jnz .w16_wpad2 +.w16_wpad_end: + lea ypxq, [ypxq+strideq*2] + psllw m2, 3 + psllw m1, 3 + mova [acq+32*0], m2 + pmaddwd m2, m5 + mova [acq+32*1], m1 + pmaddwd m0, m1, m5 + add acq, 32*2 + paddd m4, m2 + paddd m4, m0 + sub hd, 2 + jg .w16_loop + add hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc + paddd m0, m0 + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad +.w32: + mov r5, acq + test wpadd, wpadd + jnz .w32_wpad +.w32_loop: + mova m0, [ypxq+ 0] + mova m1, [ypxq+32] + add ypxq, strideq + psllw m0, 3 + psllw m1, 3 + pmaddwd m2, m0, m5 + mova [acq+32*0], m0 + pmaddwd m3, m1, m5 + mova [acq+32*1], m1 + add acq, 32*2 + paddd m2, m3 + paddd m4, m2 + dec hd + jg .w32_loop +.w32_hpad: + test hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc + paddd m2, m2 +.w32_hpad_loop: + mova [acq+32*0], m0 + mova [acq+32*1], m1 + paddd m4, m2 + mova [acq+32*2], m0 + mova [acq+32*3], m1 + add acq, 32*4 + sub hpadd, 2 + jg .w32_hpad_loop + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc +.w32_wpad: + mova m0, [ypxq+ 0] + cmp wpadd, 4 + jl .w32_wpad2 + je .w32_wpad4 + vpbroadcastw m1, [ypxq+14] + vpblendd m0, m1, 0xf0 + jmp .w32_wpad_end +.w32_wpad4: + vpbroadcastw m1, [ypxq+30] + jmp .w32_wpad_end +.w32_wpad2: + vpbroadcastw m1, [ypxq+46] + vinserti128 m1, [ypxq+32], 0 +.w32_wpad_end: + add ypxq, strideq + psllw m0, 3 + psllw m1, 3 + pmaddwd m2, m0, m5 + mova [acq+32*0], m0 + pmaddwd m3, m1, m5 + mova [acq+32*1], m1 + add acq, 32*2 + paddd m2, m3 + paddd m4, m2 + dec hd + jg .w32_wpad + jmp .w32_hpad + +cglobal pal_pred_16bpc, 4, 6, 5, dst, stride, pal, idx, w, h + vbroadcasti128 m3, [palq] + lea r2, [pal_pred_16bpc_avx2_table] + tzcnt wd, wm + vbroadcasti128 m4, [pal_pred_shuf] + movifnidn hd, hm + movsxd wq, [r2+wq*4] + pshufb m3, m4 + punpckhqdq m4, m3, m3 + add wq, r2 +DEFINE_ARGS dst, stride, stride3, idx, w, 
h + lea stride3q, [strideq*3] + jmp wq +.w4: + mova xm2, [idxq] + add idxq, 16 + pshufb xm1, xm3, xm2 + pshufb xm2, xm4, xm2 + punpcklbw xm0, xm1, xm2 + punpckhbw xm1, xm2 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+strideq*1], xm0 + movhps [dstq+stride3q ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4 + RET +.w8: + movu m2, [idxq] ; only 16-byte alignment + add idxq, 32 + pshufb m1, m3, m2 + pshufb m2, m4, m2 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm1 + vextracti128 [dstq+strideq*2], m0, 1 + vextracti128 [dstq+stride3q ], m1, 1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8 + RET +.w16: + vpermq m2, [idxq+ 0], q3120 + vpermq m5, [idxq+32], q3120 + add idxq, 64 + pshufb m1, m3, m2 + pshufb m2, m4, m2 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + pshufb m1, m3, m5 + pshufb m2, m4, m5 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16 + RET +.w32: + vpermq m2, [idxq+ 0], q3120 + vpermq m5, [idxq+32], q3120 + add idxq, 64 + pshufb m1, m3, m2 + pshufb m2, m4, m2 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+strideq*0+ 0], m0 + mova [dstq+strideq*0+32], m1 + pshufb m1, m3, m5 + pshufb m2, m4, m5 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+strideq*1+ 0], m0 + mova [dstq+strideq*1+32], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32 + RET +.w64: + vpermq m2, [idxq+ 0], q3120 + vpermq m5, [idxq+32], q3120 + add idxq, 64 + pshufb m1, m3, m2 + pshufb m2, m4, m2 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+ 0], m0 + mova [dstq+32], m1 + pshufb m1, m3, m5 + pshufb m2, m4, m5 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+64], m0 + mova [dstq+96], m1 + add dstq, strideq + dec hd + jg .w64 + RET + +%endif diff --git a/third_party/dav1d/src/x86/ipred16_avx512.asm b/third_party/dav1d/src/x86/ipred16_avx512.asm new file mode 100644 index 0000000000..1a307adc98 --- /dev/null +++ b/third_party/dav1d/src/x86/ipred16_avx512.asm @@ -0,0 +1,833 @@ +; Copyright © 2022, VideoLAN and dav1d authors +; Copyright © 2022, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +ipred_shuf: db 14, 15, 14, 15, 0, 1, 2, 3, 6, 7, 6, 7, 0, 1, 2, 3 + db 10, 11, 10, 11, 8, 9, 10, 11, 2, 3, 2, 3, 8, 9, 10, 11 + db 12, 13, 12, 13, 4, 5, 6, 7, 4, 5, 4, 5, 4, 5, 6, 7 + db 8, 9, 8, 9, 12, 13, 14, 15, 0, 1, 0, 1, 12, 13, 14, 15 +smooth_perm: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 + db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62 + db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94 + db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126 +pal_pred_perm: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39 + db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 + db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55 + db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 +filter_permA: times 4 db 6, 7, 8, 9, 14, 15, 4, 5 + times 4 db 10, 11, 12, 13, 2, 3, -1, -1 +filter_permB: times 4 db 22, 23, 24, 25, 30, 31, 6, 7 + times 4 db 26, 27, 28, 29, 14, 15, -1, -1 +filter_permC: dd 8 ; dq 8, 10, 1, 11, 0, 9 +pw_1: times 2 dw 1 + dd 10 +filter_rnd: dd 32 + dd 1 + dd 8 + dd 11 +filter_shift: times 2 dw 6 + dd 0 + times 2 dw 4 + dd 9 + +%macro JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - 2*4) + %xdefine %%base mangle(private_prefix %+ _%1_%2) + %%table: + %rep %0 - 2 + dd %%base %+ .%3 - (%%table - 2*4) + %rotate 1 + %endrep +%endmacro + +JMP_TABLE ipred_paeth_16bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_16bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_h_16bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_v_16bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE pal_pred_16bpc, avx512icl, w4, w8, w16, w32, w64 + +cextern smooth_weights_1d_16bpc +cextern smooth_weights_2d_16bpc +cextern filter_intra_taps + +SECTION .text + +%macro PAETH 3 ; top, signed_ldiff, ldiff + paddw m0, m%2, m2 + psubw m1, m0, m3 ; tldiff + psubw m0, m%1 ; tdiff + pabsw m1, m1 + pabsw m0, m0 + pcmpgtw k1, m0, m1 + pminsw m0, m1 + pcmpgtw k2, m%3, m0 + vpblendmw m0{k1}, m%1, m3 + vpblendmw m0{k2}, m2, m0 +%endmacro + +INIT_ZMM avx512icl +cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl, w, h +%define base r6-ipred_paeth_16bpc_avx512icl_table + lea r6, [ipred_paeth_16bpc_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r6+wq*4] + vpbroadcastw m3, [tlq] ; topleft + add wq, r6 + jmp wq +.w4: + vpbroadcastq m4, [tlq+2] ; top + movsldup m7, [base+ipred_shuf] + lea r6, [strideq*3] + psubw m5, m4, m3 + pabsw m6, m5 +.w4_loop: + sub tlq, 16 + vbroadcasti32x4 m2, [tlq] + pshufb m2, m7 ; left + PAETH 4, 5, 6 + vextracti32x4 xm1, m0, 2 + vextracti32x4 xm8, ym0, 1 + vextracti32x4 xm9, m0, 3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm8 + movq [dstq+r6 ], xm9 + sub hd, 8 + jl .w4_end + lea dstq, [dstq+strideq*4] + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], 
xm8 + movhps [dstq+r6 ], xm9 + lea dstq, [dstq+strideq*4] + jg .w4_loop +.w4_end: + RET +.w8: + vbroadcasti32x4 m4, [tlq+2] + movsldup m7, [base+ipred_shuf] + lea r6, [strideq*3] + psubw m5, m4, m3 + pabsw m6, m5 +.w8_loop: + sub tlq, 8 + vpbroadcastq m2, [tlq] + pshufb m2, m7 + PAETH 4, 5, 6 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+r6 ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_loop + RET +.w16: + vbroadcasti32x8 m4, [tlq+2] + movsldup m7, [base+ipred_shuf] + psubw m5, m4, m3 + pabsw m6, m5 +.w16_loop: + sub tlq, 4 + vpbroadcastd m2, [tlq] + pshufb m2, m7 + PAETH 4, 5, 6 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_loop + RET +.w32: + movu m4, [tlq+2] + psubw m5, m4, m3 + pabsw m6, m5 +.w32_loop: + sub tlq, 2 + vpbroadcastw m2, [tlq] + PAETH 4, 5, 6 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w32_loop + RET +.w64: + movu m4, [tlq+ 2] + movu m7, [tlq+66] + psubw m5, m4, m3 + psubw m8, m7, m3 + pabsw m6, m5 + pabsw m9, m8 +.w64_loop: + sub tlq, 2 + vpbroadcastw m2, [tlq] + PAETH 4, 5, 6 + mova [dstq+64*0], m0 + PAETH 7, 8, 9 + mova [dstq+64*1], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET + +cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3 +%define base r6-$$ + lea r6, [$$] + tzcnt wd, wm + mov hd, hm + movsxd wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq*4] + lea weightsq, [base+smooth_weights_1d_16bpc+hq*4] + neg hq + vpbroadcastw m6, [tlq+hq*2] ; bottom + lea wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq] + lea stride3q, [strideq*3] + jmp wq +.w4: + vpbroadcastq m5, [tlq+2] ; top + movsldup m4, [ipred_shuf] + psubw m5, m6 ; top - bottom +.w4_loop: + vbroadcasti32x4 m3, [weightsq+hq*2] + pshufb m3, m4 + pmulhrsw m3, m5 + paddw m3, m6 + vextracti32x4 xm0, m3, 3 + vextracti32x4 xm1, ym3, 1 + vextracti32x4 xm2, m3, 2 + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm3 + add hq, 8 + jg .end + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 + movq [dstq+stride3q ], xm3 + lea dstq, [dstq+strideq*4] + jl .w4_loop +.end: + RET +.w8: + vbroadcasti32x4 m5, [tlq+2] ; top + movsldup m4, [ipred_shuf] + psubw m5, m6 ; top - bottom +.w8_loop: + vpbroadcastq m0, [weightsq+hq*2] + pshufb m0, m4 + pmulhrsw m0, m5 + paddw m0, m6 + vextracti32x4 [dstq+strideq*0], m0, 3 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + mova [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w8_loop + RET +.w16: + vbroadcasti32x8 m5, [tlq+2] ; top + movsldup m4, [ipred_shuf] + psubw m5, m6 ; top - bottom +.w16_loop: + vpbroadcastd m0, [weightsq+hq*2+0] + vpbroadcastd m1, [weightsq+hq*2+4] + pshufb m0, m4 + pshufb m1, m4 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + paddw m0, m6 + paddw m1, m6 + vextracti32x8 [dstq+strideq*0], m0, 1 + mova [dstq+strideq*1], ym0 + vextracti32x8 [dstq+strideq*2], m1, 1 + mova [dstq+stride3q ], ym1 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w16_loop + RET +.w32: + movu m5, [tlq+2] + psubw m5, m6 +.w32_loop: + vpbroadcastw m0, [weightsq+hq*2+0] + vpbroadcastw m1, [weightsq+hq*2+2] + vpbroadcastw m2, [weightsq+hq*2+4] + vpbroadcastw m3, [weightsq+hq*2+6] + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + REPX {paddw x, m6}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + 
mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w32_loop + RET +.w64: + movu m4, [tlq+ 2] + movu m5, [tlq+66] + psubw m4, m6 + psubw m5, m6 +.w64_loop: + vpbroadcastw m1, [weightsq+hq*2+0] + vpbroadcastw m3, [weightsq+hq*2+2] + pmulhrsw m0, m4, m1 + pmulhrsw m1, m5 + pmulhrsw m2, m4, m3 + pmulhrsw m3, m5 + REPX {paddw x, m6}, m0, m1, m2, m3 + mova [dstq+strideq*0+64*0], m0 + mova [dstq+strideq*0+64*1], m1 + mova [dstq+strideq*1+64*0], m2 + mova [dstq+strideq*1+64*1], m3 + lea dstq, [dstq+strideq*2] + add hq, 2 + jl .w64_loop + RET + +cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl, w, h, stride3 + lea r6, [$$] + mov wd, wm + movifnidn hd, hm + vpbroadcastw m6, [tlq+wq*2] ; right + tzcnt wd, wd + add hd, hd + movsxd wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq*4] + sub tlq, hq + lea stride3q, [strideq*3] + lea wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq] + jmp wq +.w4: + movsldup m4, [base+ipred_shuf] + vpbroadcastq m5, [base+smooth_weights_1d_16bpc+4*2] +.w4_loop: + vbroadcasti32x4 m0, [tlq+hq-16] ; left + pshufb m0, m4 + psubw m0, m6 ; left - right + pmulhrsw m0, m5 + paddw m0, m6 + vextracti32x4 xm1, m0, 2 + vextracti32x4 xm2, ym0, 1 + vextracti32x4 xm3, m0, 3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 + movq [dstq+stride3q ], xm3 + sub hd, 8*2 + jl .end + lea dstq, [dstq+strideq*4] + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm3 + lea dstq, [dstq+strideq*4] + jg .w4_loop +.end: + RET +.w8: + movsldup m4, [base+ipred_shuf] + vbroadcasti32x4 m5, [base+smooth_weights_1d_16bpc+8*2] +.w8_loop: + vpbroadcastq m0, [tlq+hq-8] ; left + pshufb m0, m4 + psubw m0, m6 ; left - right + pmulhrsw m0, m5 + paddw m0, m6 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4*2 + jg .w8_loop + RET +.w16: + movsldup m4, [base+ipred_shuf] + vbroadcasti32x8 m5, [base+smooth_weights_1d_16bpc+16*2] +.w16_loop: + vpbroadcastd m0, [tlq+hq-4] + vpbroadcastd m1, [tlq+hq-8] + pshufb m0, m4 + pshufb m1, m4 + psubw m0, m6 + psubw m1, m6 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + paddw m0, m6 + paddw m1, m6 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+stride3q ], m1, 1 + lea dstq, [dstq+strideq*4] + sub hq, 4*2 + jg .w16_loop + RET +.w32: + movu m5, [base+smooth_weights_1d_16bpc+32*2] +.w32_loop: + vpbroadcastq m3, [tlq+hq-8] + punpcklwd m3, m3 + psubw m3, m6 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + REPX {paddw x, m6}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + sub hq, 4*2 + jg .w32_loop + RET +.w64: + movu m4, [base+smooth_weights_1d_16bpc+64*2] + movu m5, [base+smooth_weights_1d_16bpc+64*3] +.w64_loop: + vpbroadcastw m1, [tlq+hq-2] + vpbroadcastw m3, [tlq+hq-4] + psubw m1, m6 + psubw m3, m6 + pmulhrsw m0, m4, m1 + pmulhrsw m1, m5 + pmulhrsw m2, m4, m3 + pmulhrsw m3, m5 + REPX {paddw x, m6}, m0, m1, m2, m3 + mova [dstq+strideq*0+64*0], m0 + mova [dstq+strideq*0+64*1], m1 + mova [dstq+strideq*1+64*0], m2 + mova [dstq+strideq*1+64*1], m3 + lea dstq, [dstq+strideq*2] + sub hq, 2*2 + jg .w64_loop + RET + +cglobal 
ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3 + lea r6, [$$] + mov wd, wm + movifnidn hd, hm + vpbroadcastw m13, [tlq+wq*2] ; right + tzcnt wd, wd + add hd, hd + movsxd wq, [base+ipred_smooth_16bpc_avx512icl_table+wq*4] + mov r5d, 0x55555555 + sub tlq, hq + mova m14, [base+smooth_perm] + kmovd k1, r5d + vpbroadcastw m0, [tlq] ; bottom + mov r5, 0x3333333333333333 + pxor m15, m15 + lea wq, [base+ipred_smooth_16bpc_avx512icl_table+wq] + kmovq k2, r5 + lea v_weightsq, [base+smooth_weights_2d_16bpc+hq*2] + jmp wq +.w4: + vpbroadcastq m5, [tlq+hq+2] + movshdup m3, [base+ipred_shuf] + movsldup m4, [base+ipred_shuf] + vbroadcasti32x4 m6, [base+smooth_weights_2d_16bpc+4*4] + lea stride3q, [strideq*3] + punpcklwd m5, m0 ; top, bottom +.w4_loop: + vbroadcasti32x4 m0, [v_weightsq] + vpbroadcastq m2, [tlq+hq-8] + mova m1, m13 + pshufb m0, m3 + pmaddwd m0, m5 + pshufb m1{k2}, m2, m4 ; left, right + vpdpwssd m0, m1, m6 + vpermb m0, m14, m0 + pavgw ym0, ym15 + vextracti32x4 xm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm1 + lea dstq, [dstq+strideq*4] + add v_weightsq, 4*4 + sub hd, 4*2 + jg .w4_loop + RET +.w8: + vbroadcasti32x4 ym5, [tlq+hq+2] + movshdup m6, [base+ipred_shuf] + movsldup m7, [base+ipred_shuf] + pmovzxwd m5, ym5 + vbroadcasti32x8 m8, [base+smooth_weights_2d_16bpc+8*4] + lea stride3q, [strideq*3] + vpblendmw m5{k1}, m0, m5 ; top, bottom +.w8_loop: + vpbroadcastq m0, [v_weightsq+0] + vpbroadcastq m1, [v_weightsq+8] + vpbroadcastd m3, [tlq+hq-4] + vpbroadcastd m4, [tlq+hq-8] + pshufb m0, m6 + pmaddwd m0, m5 + pshufb m1, m6 + pmaddwd m1, m5 + mova m2, m13 + pshufb m2{k2}, m3, m7 ; left, right + mova m3, m13 + pshufb m3{k2}, m4, m7 + vpdpwssd m0, m2, m8 + vpdpwssd m1, m3, m8 + add v_weightsq, 4*4 + vpermt2b m0, m14, m1 + pavgw m0, m15 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4*2 + jg .w8_loop + RET +.w16: + pmovzxwd m5, [tlq+hq+2] + mova m6, [base+smooth_weights_2d_16bpc+16*4] + vpblendmw m5{k1}, m0, m5 ; top, bottom +.w16_loop: + vpbroadcastd m0, [v_weightsq+0] + vpbroadcastd m1, [v_weightsq+4] + pmaddwd m0, m5 + pmaddwd m1, m5 + mova m2, m13 + vpbroadcastw m2{k1}, [tlq+hq-2] ; left, right + mova m3, m13 + vpbroadcastw m3{k1}, [tlq+hq-4] + vpdpwssd m0, m2, m6 + vpdpwssd m1, m3, m6 + add v_weightsq, 2*4 + vpermt2b m0, m14, m1 + pavgw m0, m15 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hq, 2*2 + jg .w16_loop + RET +.w32: + pmovzxwd m5, [tlq+hq+ 2] + pmovzxwd m6, [tlq+hq+34] + mova m7, [base+smooth_weights_2d_16bpc+32*4] + mova m8, [base+smooth_weights_2d_16bpc+32*6] + vpblendmw m5{k1}, m0, m5 ; top, bottom + vpblendmw m6{k1}, m0, m6 +.w32_loop: + vpbroadcastd m2, [v_weightsq+0] + vpbroadcastd m3, [v_weightsq+4] + pmaddwd m0, m5, m2 + pmaddwd m2, m6 + pmaddwd m1, m5, m3 + pmaddwd m3, m6 + mova m4, m13 + vpbroadcastw m4{k1}, [tlq+hq-2] ; left, right + vpdpwssd m0, m4, m7 + vpdpwssd m2, m4, m8 + mova m4, m13 + vpbroadcastw m4{k1}, [tlq+hq-4] + vpdpwssd m1, m4, m7 + vpdpwssd m3, m4, m8 + add v_weightsq, 2*4 + vpermt2b m0, m14, m2 + vpermt2b m1, m14, m3 + pavgw m0, m15 + pavgw m1, m15 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hq, 2*2 + jg .w32_loop + RET +.w64: + pmovzxwd m5, [tlq+hq+ 2] + pmovzxwd m6, [tlq+hq+34] + pmovzxwd m7, [tlq+hq+66] 
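+; (note: the 64 top pixels are fetched as four 16-pixel chunks widened to
+; dwords; the vpblendmw below pairs each top word with the bottom pixel,
+; so a single pmaddwd can apply both 2-d vertical weights at once)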
+ pmovzxwd m8, [tlq+hq+98] + mova m9, [base+smooth_weights_2d_16bpc+64*4] + vpblendmw m5{k1}, m0, m5 ; top, bottom + mova m10, [base+smooth_weights_2d_16bpc+64*5] + vpblendmw m6{k1}, m0, m6 + mova m11, [base+smooth_weights_2d_16bpc+64*6] + vpblendmw m7{k1}, m0, m7 + mova m12, [base+smooth_weights_2d_16bpc+64*7] + vpblendmw m8{k1}, m0, m8 +.w64_loop: + vpbroadcastd m3, [v_weightsq] + mova m4, m13 + vpbroadcastw m4{k1}, [tlq+hq-2] ; left, right + pmaddwd m0, m5, m3 + pmaddwd m2, m6, m3 + pmaddwd m1, m7, m3 + pmaddwd m3, m8 + vpdpwssd m0, m4, m9 + vpdpwssd m2, m4, m10 + vpdpwssd m1, m4, m11 + vpdpwssd m3, m4, m12 + add v_weightsq, 1*4 + vpermt2b m0, m14, m2 + vpermt2b m1, m14, m3 + pavgw m0, m15 + pavgw m1, m15 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + add dstq, strideq + sub hd, 1*2 + jg .w64_loop + RET + +cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3 + lea r6, [pal_pred_16bpc_avx512icl_table] + tzcnt wd, wm + mova m2, [pal_pred_perm] + movsxd wq, [r6+wq*4] + mova xm3, [palq] + movifnidn hd, hm + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +.w4: + pmovzxbw ym0, [idxq] + add idxq, 16 + vpermw ym0, ym0, ym3 + vextracti32x4 xm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4 + RET +.w8: + pmovzxbw m0, [idxq] + add idxq, 32 + vpermw m0, m0, m3 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8 + RET +.w16: + vpermb m1, m2, [idxq] + add idxq, 64 + vpermw m0, m1, m3 + psrlw m1, 8 + vpermw m1, m1, m3 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+stride3q ], m1, 1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16 + RET +.w32: + vpermb m1, m2, [idxq] + add idxq, 64 + vpermw m0, m1, m3 + psrlw m1, 8 + vpermw m1, m1, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32 + RET +.w64: + vpermb m1, m2, [idxq] + add idxq, 64 + vpermw m0, m1, m3 + psrlw m1, 8 + vpermw m1, m1, m3 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + add dstq, strideq + dec hd + jg .w64 + RET + +; The ipred_filter SIMD processes 4x2 blocks in the following order which +; increases parallelism compared to doing things row by row. 
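+; Each 4x2 block needs one row of pixels above it and one column of two
+; pixels to its left, so two blocks whose inputs are already available can
+; be filtered per step (.main8 below computes such a pair at once). The
+; numbers give the step in which each 4x2 block is processed: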
+; w4 w8 w16 w32 +; 1 1 2 1 2 5 6 1 2 5 6 9 a d e +; 2 2 3 2 3 6 7 2 3 6 7 a b e f +; 3 3 4 3 4 7 8 3 4 7 8 b c f g +; 4 4 5 4 5 8 9 4 5 8 9 c d g h + +cglobal ipred_filter_16bpc, 4, 7, 14, dst, stride, tl, w, h, filter, top +%define base r6-$$ + lea r6, [$$] +%ifidn filterd, filterm + movzx filterd, filterb +%else + movzx filterd, byte filterm +%endif + shl filterd, 6 + movifnidn hd, hm + movu xm0, [tlq-6] + pmovsxbw m7, [base+filter_intra_taps+filterq+32*0] + pmovsxbw m8, [base+filter_intra_taps+filterq+32*1] + mov r5d, r8m ; bitdepth_max + movsldup m9, [base+filter_permA] + movshdup m10, [base+filter_permA] + shr r5d, 11 ; is_12bpc + jnz .12bpc + psllw m7, 2 ; upshift multipliers so that packusdw + psllw m8, 2 ; will perform clipping for free +.12bpc: + vpbroadcastd m5, [base+filter_rnd+r5*8] + vpbroadcastd m6, [base+filter_shift+r5*8] + sub wd, 8 + jl .w4 +.w8: + call .main4 + movsldup m11, [filter_permB] + lea r5d, [hq*2+2] + movshdup m12, [filter_permB] + lea topq, [tlq+2] + mova m13, [filter_permC] + sub hd, 4 + vinserti32x4 ym0, [topq], 1 ; a0 b0 t0 t1 + sub tlq, r5 +%if WIN64 + push r7 + push r8 +%endif + mov r7, dstq + mov r8d, hd +.w8_loop: + movlps xm4, xm0, [tlq+hq*2] + call .main8 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jge .w8_loop + test wd, wd + jz .end + mov r2d, 0x0d + kmovb k1, r2d + lea r2, [strideq*3] +.w16: + movd xmm0, [r7+strideq*1+12] + vpblendd xmm0, [topq+8], 0x0e ; t1 t2 + pinsrw xm4, xmm0, [r7+strideq*0+14], 2 + call .main8 + add r7, 16 + vinserti32x4 ym0, [topq+16], 1 ; a2 b2 t2 t3 + mov hd, r8d + mov dstq, r7 + add topq, 16 +.w16_loop: + movd xmm1, [dstq+strideq*2-4] + punpcklwd xm4, xmm1, xmm0 + movd xmm0, [dstq+r2-4] + shufps xm4{k1}, xmm0, xm0, q3210 + call .main8 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jge .w16_loop + sub wd, 8 + jg .w16 +.end: + vpermb m2, m11, m0 + mova ym1, ym5 + vpdpwssd m1, m2, m7 + vpermb m2, m12, m0 + vpdpwssd m1, m2, m8 +%if WIN64 + pop r8 + pop r7 +%endif + vextracti32x8 ym2, m1, 1 + paddd ym1, ym2 + packusdw ym1, ym1 + vpsrlvw ym1, ym6 + vpermt2q m0, m13, m1 + vextracti32x4 [dstq+strideq*0], m0, 2 + vextracti32x4 [dstq+strideq*1], ym0, 1 + RET +.w4_loop: + movlps xm0, [tlq-10] + lea dstq, [dstq+strideq*2] + sub tlq, 4 +.w4: + call .main4 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + sub hd, 2 + jg .w4_loop + RET +ALIGN function_align +.main4: + vpermb m2, m9, m0 + mova ym1, ym5 + vpdpwssd m1, m2, m7 + vpermb m0, m10, m0 + vpdpwssd m1, m0, m8 + vextracti32x8 ym0, m1, 1 + paddd ym0, ym1 + vextracti32x4 xm1, ym0, 1 + packusdw xm0, xm1 ; clip + vpsrlvw xm0, xm6 + ret +ALIGN function_align +.main8: + vpermb m3, m11, m0 + mova ym2, ym5 + vpdpwssd m2, m3, m7 + vpermb m3, m9, m4 + mova ym1, ym5 + vpdpwssd m1, m3, m7 + vpermb m3, m12, m0 + vpdpwssd m2, m3, m8 + vpermb m3, m10, m4 + vpdpwssd m1, m3, m8 + vextracti32x8 ym4, m2, 1 + vextracti32x8 ym3, m1, 1 + paddd ym2, ym4 + paddd ym1, ym3 + packusdw ym1, ym2 ; clip + vpsrlvw ym1, ym6 + vpermt2q m0, m13, m1 ; c0 d0 b0 b1 a0 a1 + vextracti32x4 [dstq+strideq*0], m0, 2 + vextracti32x4 [dstq+strideq*1], ym0, 1 + ret + +%endif diff --git a/third_party/dav1d/src/x86/ipred16_sse.asm b/third_party/dav1d/src/x86/ipred16_sse.asm new file mode 100644 index 0000000000..07ea9567e1 --- /dev/null +++ b/third_party/dav1d/src/x86/ipred16_sse.asm @@ -0,0 +1,1923 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA + +filter_shuf: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1 +pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 + +pb_0_1: times 4 db 0, 1 +pb_2_3: times 4 db 2, 3 +pw_1: times 4 dw 1 +pw_2: times 4 dw 2 +pw_4: times 4 dw 4 +pw_512: times 4 dw 512 +pw_2048: times 4 dw 2048 + +%macro JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - 2*4) + %xdefine %%base mangle(private_prefix %+ _%1_%2) + %%table: + %rep %0 - 2 + dd %%base %+ .%3 - (%%table - 2*4) + %rotate 1 + %endrep +%endmacro + +%define ipred_dc_splat_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 10*4) +%define ipred_dc_128_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 15*4) +%define ipred_cfl_splat_16bpc_ssse3_table (ipred_cfl_16bpc_ssse3_table + 8*4) + +JMP_TABLE ipred_dc_left_16bpc, ssse3, h4, h8, h16, h32, h64 +JMP_TABLE ipred_dc_16bpc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ + s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4, \ + s4-15*4, s8-15*4, s16c-15*4, s32c-15*4, s64-15*4 +JMP_TABLE ipred_h_16bpc, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_cfl_16bpc, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \ + s4-8*4, s8-8*4, s16-8*4, s32-8*4 +JMP_TABLE ipred_cfl_left_16bpc, ssse3, h4, h8, h16, h32 +JMP_TABLE ipred_cfl_ac_444_16bpc, ssse3, w4, w8, w16, w32 +JMP_TABLE pal_pred_16bpc, ssse3, w4, w8, w16, w32, w64 + +cextern smooth_weights_1d_16bpc +cextern smooth_weights_2d_16bpc +cextern filter_intra_taps + +SECTION .text + +INIT_XMM ssse3 +cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h + LEA r5, ipred_dc_left_16bpc_ssse3_table + movd m4, wm + tzcnt wd, wm + add tlq, 2 + movifnidn hd, hm + pxor m3, m3 + pavgw m4, m3 + movd m5, wd + movu m0, [tlq] + movsxd r6, [r5+wq*4] + add r6, r5 + add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 + +cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 + LEA r5, ipred_dc_left_16bpc_ssse3_table + mov hd, hm + movd m4, hm + tzcnt r6d, hd + sub tlq, hq + tzcnt wd, wm + pxor m3, m3 + sub tlq, hq + pavgw m4, m3 + movd m5, r6d + movu m0, [tlq] + movsxd r6, [r5+r6*4] + add r6, r5 + add r5, 
ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 +.h64: + movu m2, [tlq+112] + movu m1, [tlq+ 96] + paddw m0, m2 + movu m2, [tlq+ 80] + paddw m1, m2 + movu m2, [tlq+ 64] + paddw m0, m2 + paddw m0, m1 +.h32: + movu m1, [tlq+ 48] + movu m2, [tlq+ 32] + paddw m1, m2 + paddw m0, m1 +.h16: + movu m1, [tlq+ 16] + paddw m0, m1 +.h8: + movhlps m1, m0 + paddw m0, m1 +.h4: + punpcklwd m0, m3 + paddd m4, m0 + punpckhqdq m0, m0 + paddd m0, m4 + pshuflw m4, m0, q1032 + paddd m0, m4 + psrld m0, m5 + lea stride3q, [strideq*3] + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + jmp wq + +cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3 + movifnidn hd, hm + tzcnt r6d, hd + lea r5d, [wq+hq] + movd m4, r5d + tzcnt r5d, r5d + movd m5, r5d + LEA r5, ipred_dc_16bpc_ssse3_table + tzcnt wd, wd + movsxd r6, [r5+r6*4] + movsxd wq, [r5+wq*4+5*4] + pxor m3, m3 + psrlw m4, 1 + add r6, r5 + add wq, r5 + lea stride3q, [strideq*3] + jmp r6 +.h4: + movq m0, [tlq-8] + jmp wq +.w4: + movq m1, [tlq+2] + paddw m1, m0 + punpckhwd m0, m3 + punpcklwd m1, m3 + paddd m0, m1 + paddd m4, m0 + punpckhqdq m0, m0 + paddd m0, m4 + pshuflw m1, m0, q1032 + paddd m0, m1 + cmp hd, 4 + jg .w4_mul + psrlw m0, 3 + jmp .w4_end +.w4_mul: + mov r2d, 0xAAAB + mov r3d, 0x6667 + cmp hd, 16 + cmove r2d, r3d + psrld m0, 2 + movd m1, r2d + pmulhuw m0, m1 + psrlw m0, 1 +.w4_end: + pshuflw m0, m0, q0000 +.s4: + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s4 + RET +.h8: + mova m0, [tlq-16] + jmp wq +.w8: + movu m1, [tlq+2] + paddw m0, m1 + punpcklwd m1, m0, m3 + punpckhwd m0, m3 + paddd m0, m1 + paddd m4, m0 + punpckhqdq m0, m0 + paddd m0, m4 + pshuflw m1, m0, q1032 + paddd m0, m1 + psrld m0, m5 + cmp hd, 8 + je .w8_end + mov r2d, 0xAAAB + mov r3d, 0x6667 + cmp hd, 32 + cmove r2d, r3d + movd m1, r2d + pmulhuw m0, m1 + psrlw m0, 1 +.w8_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s8: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s8 + RET +.h16: + mova m0, [tlq-32] + paddw m0, [tlq-16] + jmp wq +.w16: + movu m1, [tlq+ 2] + movu m2, [tlq+18] + paddw m1, m2 + paddw m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + paddd m0, m1 + paddd m4, m0 + punpckhqdq m0, m0 + paddd m0, m4 + pshuflw m1, m0, q1032 + paddd m0, m1 + psrld m0, m5 + cmp hd, 16 + je .w16_end + mov r2d, 0xAAAB + mov r3d, 0x6667 + test hd, 8|32 + cmovz r2d, r3d + movd m1, r2d + pmulhuw m0, m1 + psrlw m0, 1 +.w16_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s16c: + mova m1, m0 +.s16: + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*0+16*1], m1 + mova [dstq+strideq*1+16*0], m0 + mova [dstq+strideq*1+16*1], m1 + mova [dstq+strideq*2+16*0], m0 + mova [dstq+strideq*2+16*1], m1 + mova [dstq+stride3q +16*0], m0 + mova [dstq+stride3q +16*1], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s16 + RET +.h32: + mova m0, [tlq-64] + paddw m0, [tlq-48] + paddw m0, [tlq-32] + paddw m0, [tlq-16] + jmp wq +.w32: + movu m1, [tlq+ 2] + movu m2, [tlq+18] + paddw m1, m2 + movu m2, [tlq+34] + paddw m0, m2 + movu m2, [tlq+50] + paddw m1, m2 + paddw m0, m1 + punpcklwd m1, m0, m3 + punpckhwd m0, m3 + paddd m0, m1 + paddd m4, m0 + punpckhqdq m0, m0 + paddd m0, m4 + pshuflw m1, m0, q1032 + paddd m0, m1 + psrld m0, m5 + cmp hd, 32 + je .w32_end + mov r2d, 0xAAAB + mov r3d, 0x6667 + cmp hd, 8 + cmove r2d, r3d + movd 
m1, r2d + pmulhuw m0, m1 + psrlw m0, 1 +.w32_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s32c: + mova m1, m0 + mova m2, m0 + mova m3, m0 +.s32: + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*0+16*1], m1 + mova [dstq+strideq*0+16*2], m2 + mova [dstq+strideq*0+16*3], m3 + mova [dstq+strideq*1+16*0], m0 + mova [dstq+strideq*1+16*1], m1 + mova [dstq+strideq*1+16*2], m2 + mova [dstq+strideq*1+16*3], m3 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .s32 + RET +.h64: + mova m0, [tlq-128] + mova m1, [tlq-112] + paddw m0, [tlq- 96] + paddw m1, [tlq- 80] + paddw m0, [tlq- 64] + paddw m1, [tlq- 48] + paddw m0, [tlq- 32] + paddw m1, [tlq- 16] + paddw m0, m1 + jmp wq +.w64: + movu m1, [tlq+ 2] + movu m2, [tlq+ 18] + paddw m1, m2 + movu m2, [tlq+ 34] + paddw m0, m2 + movu m2, [tlq+ 50] + paddw m1, m2 + movu m2, [tlq+ 66] + paddw m0, m2 + movu m2, [tlq+ 82] + paddw m1, m2 + movu m2, [tlq+ 98] + paddw m0, m2 + movu m2, [tlq+114] + paddw m1, m2 + paddw m0, m1 + punpcklwd m1, m0, m3 + punpckhwd m0, m3 + paddd m0, m1 + paddd m4, m0 + punpckhqdq m0, m0 + paddd m0, m4 + pshuflw m1, m0, q1032 + paddd m0, m1 + psrld m0, m5 + cmp hd, 64 + je .w64_end + mov r2d, 0xAAAB + mov r3d, 0x6667 + cmp hd, 16 + cmove r2d, r3d + movd m1, r2d + pmulhuw m0, m1 + psrlw m0, 1 +.w64_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s64: + mova [dstq+16*0], m0 + mova [dstq+16*1], m0 + mova [dstq+16*2], m0 + mova [dstq+16*3], m0 + mova [dstq+16*4], m0 + mova [dstq+16*5], m0 + mova [dstq+16*6], m0 + mova [dstq+16*7], m0 + add dstq, strideq + dec hd + jg .s64 + RET + +cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 + mov r6d, r8m + LEA r5, ipred_dc_128_16bpc_ssse3_table + tzcnt wd, wm + shr r6d, 11 + movifnidn hd, hm + movsxd wq, [r5+wq*4] + movddup m0, [r5-ipred_dc_128_16bpc_ssse3_table+pw_512+r6*8] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +cglobal ipred_v_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3 + LEA r5, ipred_dc_splat_16bpc_ssse3_table + movifnidn hd, hm + movu m0, [tlq+ 2] + movu m1, [tlq+ 18] + movu m2, [tlq+ 34] + movu m3, [tlq+ 50] + cmp wd, 64 + je .w64 + tzcnt wd, wd + movsxd wq, [r5+wq*4] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq +.w64: + WIN64_SPILL_XMM 8 + movu m4, [tlq+ 66] + movu m5, [tlq+ 82] + movu m6, [tlq+ 98] + movu m7, [tlq+114] +.w64_loop: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + mova [dstq+16*3], m3 + mova [dstq+16*4], m4 + mova [dstq+16*5], m5 + mova [dstq+16*6], m6 + mova [dstq+16*7], m7 + add dstq, strideq + dec hd + jg .w64_loop + RET + +cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 +%define base r5-ipred_h_16bpc_ssse3_table + tzcnt wd, wm + LEA r5, ipred_h_16bpc_ssse3_table + movifnidn hd, hm + movsxd wq, [r5+wq*4] + movddup m2, [base+pb_0_1] + movddup m3, [base+pb_2_3] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq +.w4: + sub tlq, 8 + movq m3, [tlq] + pshuflw m0, m3, q3333 + pshuflw m1, m3, q2222 + pshuflw m2, m3, q1111 + pshuflw m3, m3, q0000 + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m1 + movq [dstq+strideq*2], m2 + movq [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4 + RET +.w8: + sub tlq, 8 + movq m3, [tlq] + punpcklwd m3, m3 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8 + RET +.w16: + sub tlq, 4 + movd m1, [tlq] + pshufb m0, m1, m3 + pshufb m1, m2 + mova 
[dstq+strideq*0+16*0], m0 + mova [dstq+strideq*0+16*1], m0 + mova [dstq+strideq*1+16*0], m1 + mova [dstq+strideq*1+16*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16 + RET +.w32: + sub tlq, 4 + movd m1, [tlq] + pshufb m0, m1, m3 + pshufb m1, m2 + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*0+16*1], m0 + mova [dstq+strideq*0+16*2], m0 + mova [dstq+strideq*0+16*3], m0 + mova [dstq+strideq*1+16*0], m1 + mova [dstq+strideq*1+16*1], m1 + mova [dstq+strideq*1+16*2], m1 + mova [dstq+strideq*1+16*3], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32 + RET +.w64: + sub tlq, 2 + movd m0, [tlq] + pshufb m0, m2 + mova [dstq+16*0], m0 + mova [dstq+16*1], m0 + mova [dstq+16*2], m0 + mova [dstq+16*3], m0 + mova [dstq+16*4], m0 + mova [dstq+16*5], m0 + mova [dstq+16*6], m0 + mova [dstq+16*7], m0 + add dstq, strideq + dec hd + jg .w64 + RET + +cglobal ipred_paeth_16bpc, 4, 6, 8, dst, stride, tl, w, h, left +%define base r5-ipred_paeth_16bpc_ssse3_table + movifnidn hd, hm + pshuflw m4, [tlq], q0000 + mov leftq, tlq + add hd, hd + punpcklqdq m4, m4 ; topleft + sub leftq, hq + and wd, ~7 + jnz .w8 + movddup m5, [tlq+2] ; top + psubw m6, m5, m4 + pabsw m7, m6 +.w4_loop: + movd m1, [leftq+hq-4] + punpcklwd m1, m1 + punpckldq m1, m1 ; left +%macro PAETH 0 + paddw m0, m6, m1 + psubw m2, m4, m0 ; tldiff + psubw m0, m5 ; tdiff + pabsw m2, m2 + pabsw m0, m0 + pminsw m2, m0 + pcmpeqw m0, m2 + pand m3, m5, m0 + pandn m0, m4 + por m0, m3 + pcmpgtw m3, m7, m2 + pand m0, m3 + pandn m3, m1 + por m0, m3 +%endmacro + PAETH + movhps [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + sub hd, 2*2 + jg .w4_loop + RET +.w8: +%if ARCH_X86_32 + PUSH r6 + %define r7d hm + %assign regs_used 7 +%elif WIN64 + movaps r4m, m8 + PUSH r7 + %assign regs_used 8 +%endif +%if ARCH_X86_64 + movddup m8, [pb_0_1] +%endif + lea tlq, [tlq+wq*2+2] + neg wq + mov r7d, hd +.w8_loop0: + movu m5, [tlq+wq*2] + mov r6, dstq + add dstq, 16 + psubw m6, m5, m4 + pabsw m7, m6 +.w8_loop: + movd m1, [leftq+hq-2] +%if ARCH_X86_64 + pshufb m1, m8 +%else + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 +%endif + PAETH + mova [r6], m0 + add r6, strideq + sub hd, 1*2 + jg .w8_loop + mov hd, r7d + add wq, 8 + jl .w8_loop0 +%if WIN64 + movaps m8, r4m +%endif + RET + +%if ARCH_X86_64 +DECLARE_REG_TMP 7 +%else +DECLARE_REG_TMP 4 +%endif + +cglobal ipred_smooth_v_16bpc, 4, 6, 6, dst, stride, tl, w, h, weights + LEA weightsq, smooth_weights_1d_16bpc + mov hd, hm + lea weightsq, [weightsq+hq*4] + neg hq + movd m5, [tlq+hq*2] ; bottom + pshuflw m5, m5, q0000 + punpcklqdq m5, m5 + cmp wd, 4 + jne .w8 + movddup m4, [tlq+2] ; top + lea r3, [strideq*3] + psubw m4, m5 ; top - bottom +.w4_loop: + movq m1, [weightsq+hq*2] + punpcklwd m1, m1 + pshufd m0, m1, q1100 + punpckhdq m1, m1 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + paddw m0, m5 + paddw m1, m5 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [dstq+strideq*2], m1 + movhps [dstq+r3 ], m1 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w4_loop + RET +.w8: +%if ARCH_X86_32 + PUSH r6 + %assign regs_used 7 + mov hm, hq + %define hq hm +%elif WIN64 + PUSH r7 + %assign regs_used 8 +%endif +.w8_loop0: + mov t0, hq + movu m4, [tlq+2] + add tlq, 16 + mov r6, dstq + add dstq, 16 + psubw m4, m5 +.w8_loop: + movq m3, [weightsq+t0*2] + punpcklwd m3, m3 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + REPX {paddw x, m5}, m0, m1, m2, m3 + mova [r6+strideq*0], m0 + mova [r6+strideq*1], m1 + lea r6, 
[r6+strideq*2] + mova [r6+strideq*0], m2 + mova [r6+strideq*1], m3 + lea r6, [r6+strideq*2] + add t0, 4 + jl .w8_loop + sub wd, 8 + jg .w8_loop0 + RET + +cglobal ipred_smooth_h_16bpc, 3, 6, 6, dst, stride, tl, w, h, weights + LEA weightsq, smooth_weights_1d_16bpc + mov wd, wm + movifnidn hd, hm + movd m5, [tlq+wq*2] ; right + sub tlq, 8 + add hd, hd + pshuflw m5, m5, q0000 + sub tlq, hq + punpcklqdq m5, m5 + cmp wd, 4 + jne .w8 + movddup m4, [weightsq+4*2] + lea r3, [strideq*3] +.w4_loop: + movq m1, [tlq+hq] ; left + punpcklwd m1, m1 + psubw m1, m5 ; left - right + pshufd m0, m1, q3322 + punpckldq m1, m1 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + paddw m0, m5 + paddw m1, m5 + movhps [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movhps [dstq+strideq*2], m1 + movq [dstq+r3 ], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4*2 + jg .w4_loop + RET +.w8: + lea weightsq, [weightsq+wq*4] + neg wq +%if ARCH_X86_32 + PUSH r6 + %assign regs_used 7 + %define hd hm +%elif WIN64 + PUSH r7 + %assign regs_used 8 +%endif +.w8_loop0: + mov t0d, hd + mova m4, [weightsq+wq*2] + mov r6, dstq + add dstq, 16 +.w8_loop: + movq m3, [tlq+t0*(1+ARCH_X86_32)] + punpcklwd m3, m3 + psubw m3, m5 + pshufd m0, m3, q3333 + pshufd m1, m3, q2222 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + REPX {paddw x, m5}, m0, m1, m2, m3 + mova [r6+strideq*0], m0 + mova [r6+strideq*1], m1 + lea r6, [r6+strideq*2] + mova [r6+strideq*0], m2 + mova [r6+strideq*1], m3 + lea r6, [r6+strideq*2] + sub t0d, 4*(1+ARCH_X86_64) + jg .w8_loop + add wq, 8 + jl .w8_loop0 + RET + +%if ARCH_X86_64 +DECLARE_REG_TMP 10 +%else +DECLARE_REG_TMP 3 +%endif + +cglobal ipred_smooth_16bpc, 3, 7, 8, dst, stride, tl, w, h, \ + h_weights, v_weights, top + LEA h_weightsq, smooth_weights_2d_16bpc + mov wd, wm + mov hd, hm + movd m7, [tlq+wq*2] ; right + lea v_weightsq, [h_weightsq+hq*8] + neg hq + movd m6, [tlq+hq*2] ; bottom + pshuflw m7, m7, q0000 + pshuflw m6, m6, q0000 + cmp wd, 4 + jne .w8 + movq m4, [tlq+2] ; top + mova m5, [h_weightsq+4*4] + punpcklwd m4, m6 ; top, bottom + pxor m6, m6 +.w4_loop: + movq m1, [v_weightsq+hq*4] + sub tlq, 4 + movd m3, [tlq] ; left + pshufd m0, m1, q0000 + pshufd m1, m1, q1111 + pmaddwd m0, m4 + punpcklwd m3, m7 ; left, right + pmaddwd m1, m4 + pshufd m2, m3, q1111 + pshufd m3, m3, q0000 + pmaddwd m2, m5 + pmaddwd m3, m5 + paddd m0, m2 + paddd m1, m3 + psrld m0, 8 + psrld m1, 8 + packssdw m0, m1 + pavgw m0, m6 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + add hq, 2 + jl .w4_loop + RET +.w8: +%if ARCH_X86_32 + lea h_weightsq, [h_weightsq+wq*4] + mov t0, tlq + mov r1m, tlq + mov r2m, hq + %define m8 [h_weightsq+16*0] + %define m9 [h_weightsq+16*1] +%else +%if WIN64 + movaps r4m, m8 + movaps r6m, m9 + PUSH r7 + PUSH r8 +%endif + PUSH r9 + PUSH r10 + %assign regs_used 11 + lea h_weightsq, [h_weightsq+wq*8] + lea topq, [tlq+wq*2] + neg wq + mov r8, tlq + mov r9, hq +%endif + punpcklqdq m6, m6 +.w8_loop0: +%if ARCH_X86_32 + movu m5, [t0+2] + add t0, 16 + mov r0m, t0 +%else + movu m5, [topq+wq*2+2] + mova m8, [h_weightsq+wq*4+16*0] + mova m9, [h_weightsq+wq*4+16*1] +%endif + mov t0, dstq + add dstq, 16 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 +.w8_loop: + movd m1, [v_weightsq+hq*4] + sub tlq, 2 + movd m3, [tlq] ; left + pshufd m1, m1, q0000 + pmaddwd m0, m4, m1 + pshuflw m3, m3, q0000 + pmaddwd m1, m5 + punpcklwd m3, m7 ; left, right + pmaddwd m2, m8, m3 + pmaddwd m3, m9 + paddd m0, m2 + paddd m1, m3 + psrld m0, 8 + psrld m1, 8 + packssdw m0, m1 + pxor m1, 
m1 + pavgw m0, m1 + mova [t0], m0 + add t0, strideq + inc hq + jl .w8_loop +%if ARCH_X86_32 + mov t0, r0m + mov tlq, r1m + add h_weightsq, 16*2 + mov hq, r2m + sub dword wm, 8 + jg .w8_loop0 +%else + mov tlq, r8 + mov hq, r9 + add wq, 8 + jl .w8_loop0 +%endif +%if WIN64 + movaps m8, r4m + movaps m9, r6m +%endif + RET + +%if ARCH_X86_64 +cglobal ipred_filter_16bpc, 4, 7, 16, dst, stride, tl, w, h, filter +%else +cglobal ipred_filter_16bpc, 4, 7, 8, -16*8, dst, stride, tl, w, h, filter +%define m8 [esp+16*0] +%define m9 [esp+16*1] +%define m10 [esp+16*2] +%define m11 [esp+16*3] +%define m12 [esp+16*4] +%define m13 [esp+16*5] +%define m14 [esp+16*6] +%define m15 [esp+16*7] +%endif +%define base r6-$$ + movifnidn hd, hm + movd m6, r8m ; bitdepth_max +%ifidn filterd, filterm + movzx filterd, filterb +%else + movzx filterd, byte filterm +%endif + LEA r6, $$ + shl filterd, 6 + movu m0, [tlq-6] ; __ l1 l0 tl t0 t1 t2 t3 + mova m1, [base+filter_intra_taps+filterq+16*0] + mova m2, [base+filter_intra_taps+filterq+16*1] + mova m3, [base+filter_intra_taps+filterq+16*2] + mova m4, [base+filter_intra_taps+filterq+16*3] + pxor m5, m5 +%if ARCH_X86_64 + punpcklbw m8, m5, m1 ; place 8-bit coefficients in the upper + punpckhbw m9, m5, m1 ; half of each 16-bit word to avoid + punpcklbw m10, m5, m2 ; having to perform sign-extension. + punpckhbw m11, m5, m2 + punpcklbw m12, m5, m3 + punpckhbw m13, m5, m3 + punpcklbw m14, m5, m4 + punpckhbw m15, m5, m4 +%else + punpcklbw m7, m5, m1 + mova m8, m7 + punpckhbw m7, m5, m1 + mova m9, m7 + punpcklbw m7, m5, m2 + mova m10, m7 + punpckhbw m7, m5, m2 + mova m11, m7 + punpcklbw m7, m5, m3 + mova m12, m7 + punpckhbw m7, m5, m3 + mova m13, m7 + punpcklbw m7, m5, m4 + mova m14, m7 + punpckhbw m7, m5, m4 + mova m15, m7 +%endif + mova m7, [base+filter_shuf] + add hd, hd + mov r5, dstq + pshuflw m6, m6, q0000 + mov r6, tlq + punpcklqdq m6, m6 + sub tlq, hq +.left_loop: + pshufb m0, m7 ; tl t0 t1 t2 t3 l0 l1 __ + pshufd m1, m0, q0000 + pmaddwd m2, m8, m1 + pmaddwd m1, m9 + pshufd m4, m0, q1111 + pmaddwd m3, m10, m4 + pmaddwd m4, m11 + paddd m2, m3 + paddd m1, m4 + pshufd m4, m0, q2222 + pmaddwd m3, m12, m4 + pmaddwd m4, m13 + paddd m2, m3 + paddd m1, m4 + pshufd m3, m0, q3333 + pmaddwd m0, m14, m3 + pmaddwd m3, m15 + paddd m0, m2 + paddd m1, m3 + psrad m0, 11 ; x >> 3 + psrad m1, 11 + packssdw m0, m1 + pmaxsw m0, m5 + pavgw m0, m5 ; (x + 8) >> 4 + pminsw m0, m6 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movlps m0, [tlq+hq-10] + lea dstq, [dstq+strideq*2] + sub hd, 2*2 + jg .left_loop + sub wd, 4 + jz .end + sub tld, r6d ; -h*2 + sub r6, r5 ; tl-dst +.right_loop0: + add r5, 8 + mov hd, tld + movu m0, [r5+r6] ; tl t0 t1 t2 t3 __ __ __ + mov dstq, r5 +.right_loop: + pshufd m2, m0, q0000 + pmaddwd m1, m8, m2 + pmaddwd m2, m9 + pshufd m4, m0, q1111 + pmaddwd m3, m10, m4 + pmaddwd m4, m11 + pinsrw m0, [dstq+strideq*0-2], 5 + paddd m1, m3 + paddd m2, m4 + pshufd m0, m0, q2222 + movddup m4, [dstq+strideq*1-8] + pmaddwd m3, m12, m0 + pmaddwd m0, m13 + paddd m1, m3 + paddd m0, m2 + pshuflw m2, m4, q3333 + punpcklwd m2, m5 + pmaddwd m3, m14, m2 + pmaddwd m2, m15 + paddd m1, m3 + paddd m0, m2 + psrad m1, 11 + psrad m0, 11 + packssdw m0, m1 + pmaxsw m0, m5 + pavgw m0, m5 + pminsw m0, m6 + movhps [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + palignr m0, m4, 14 + lea dstq, [dstq+strideq*2] + add hd, 2*2 + jl .right_loop + sub wd, 4 + jg .right_loop0 +.end: + RET + +%if UNIX64 +DECLARE_REG_TMP 7 +%else +DECLARE_REG_TMP 5 +%endif + +cglobal ipred_cfl_top_16bpc, 4, 7, 8, 
dst, stride, tl, w, h, ac + LEA t0, ipred_cfl_left_16bpc_ssse3_table + movd m4, wd + tzcnt wd, wd + movifnidn hd, hm + add tlq, 2 + movsxd r6, [t0+wq*4] + movd m5, wd + jmp mangle(private_prefix %+ _ipred_cfl_left_16bpc_ssse3.start) + +cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha + movifnidn hd, hm + LEA t0, ipred_cfl_left_16bpc_ssse3_table + tzcnt wd, wm + lea r6d, [hq*2] + movd m4, hd + sub tlq, r6 + tzcnt r6d, hd + movd m5, r6d + movsxd r6, [t0+r6*4] +.start: + movd m7, r7m + movu m0, [tlq] + add r6, t0 + add t0, ipred_cfl_splat_16bpc_ssse3_table-ipred_cfl_left_16bpc_ssse3_table + movsxd wq, [t0+wq*4] + pxor m6, m6 + pshuflw m7, m7, q0000 + pcmpeqw m3, m3 + add wq, t0 + movifnidn acq, acmp + pavgw m4, m6 + punpcklqdq m7, m7 + jmp r6 +.h32: + movu m1, [tlq+48] + movu m2, [tlq+32] + paddw m0, m1 + paddw m0, m2 +.h16: + movu m1, [tlq+16] + paddw m0, m1 +.h8: + pshufd m1, m0, q1032 + paddw m0, m1 +.h4: + pmaddwd m0, m3 + psubd m4, m0 + pshuflw m0, m4, q1032 + paddd m0, m4 + psrld m0, m5 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + jmp wq + +%macro IPRED_CFL 2 ; dst, src + pabsw m%1, m%2 + pmulhrsw m%1, m2 + psignw m%2, m1 + psignw m%1, m%2 + paddw m%1, m0 + pmaxsw m%1, m6 + pminsw m%1, m7 +%endmacro + +cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac, alpha + movifnidn hd, hm + tzcnt r6d, hd + lea t0d, [wq+hq] + movd m4, t0d + tzcnt t0d, t0d + movd m5, t0d + LEA t0, ipred_cfl_16bpc_ssse3_table + tzcnt wd, wd + movd m7, r7m + movsxd r6, [t0+r6*4] + movsxd wq, [t0+wq*4+4*4] + psrlw m4, 1 + pxor m6, m6 + pshuflw m7, m7, q0000 + add r6, t0 + add wq, t0 + movifnidn acq, acmp + pcmpeqw m3, m3 + punpcklqdq m7, m7 + jmp r6 +.h4: + movq m0, [tlq-8] + jmp wq +.w4: + movq m1, [tlq+2] + paddw m0, m1 + pmaddwd m0, m3 + psubd m4, m0 + pshufd m0, m4, q1032 + paddd m0, m4 + pshuflw m4, m0, q1032 + paddd m0, m4 + cmp hd, 4 + jg .w4_mul + psrld m0, 3 + jmp .w4_end +.w4_mul: + mov r6d, 0xAAAB + mov r2d, 0x6667 + cmp hd, 16 + cmove r6d, r2d + movd m1, r6d + psrld m0, 2 + pmulhuw m0, m1 + psrlw m0, 1 +.w4_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s4: + movd m1, alpham + lea r6, [strideq*3] + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + pabsw m2, m1 + psllw m2, 9 +.s4_loop: + mova m4, [acq+16*0] + mova m5, [acq+16*1] + add acq, 16*2 + IPRED_CFL 3, 4 + IPRED_CFL 4, 5 + movq [dstq+strideq*0], m3 + movhps [dstq+strideq*1], m3 + movq [dstq+strideq*2], m4 + movhps [dstq+r6 ], m4 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s4_loop + RET +.h8: + mova m0, [tlq-16] + jmp wq +.w8: + movu m1, [tlq+2] + paddw m0, m1 + pmaddwd m0, m3 + psubd m4, m0 + pshufd m0, m4, q1032 + paddd m0, m4 + pshuflw m4, m0, q1032 + paddd m0, m4 + psrld m0, m5 + cmp hd, 8 + je .w8_end + mov r6d, 0xAAAB + mov r2d, 0x6667 + cmp hd, 32 + cmove r6d, r2d + movd m1, r6d + pmulhuw m0, m1 + psrlw m0, 1 +.w8_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s8: + movd m1, alpham + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + pabsw m2, m1 + psllw m2, 9 +.s8_loop: + mova m4, [acq+16*0] + mova m5, [acq+16*1] + add acq, 16*2 + IPRED_CFL 3, 4 + IPRED_CFL 4, 5 + mova [dstq+strideq*0], m3 + mova [dstq+strideq*1], m4 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .s8_loop + RET +.h16: + mova m0, [tlq-32] + paddw m0, [tlq-16] + jmp wq +.w16: + movu m1, [tlq+ 2] + movu m2, [tlq+18] + paddw m1, m2 + paddw m0, m1 + pmaddwd m0, m3 + psubd m4, m0 + pshufd m0, m4, q1032 + paddd m0, m4 + pshuflw m4, m0, q1032 + paddd m0, m4 + psrld m0, m5 + cmp hd, 16 + je .w16_end + mov r6d, 0xAAAB + mov r2d, 0x6667 + test hd, 8|32 + 
cmovz r6d, r2d + movd m1, r6d + pmulhuw m0, m1 + psrlw m0, 1 +.w16_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s16: + movd m1, alpham + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + pabsw m2, m1 + psllw m2, 9 +.s16_loop: + mova m4, [acq+16*0] + mova m5, [acq+16*1] + add acq, 16*2 + IPRED_CFL 3, 4 + IPRED_CFL 4, 5 + mova [dstq+16*0], m3 + mova [dstq+16*1], m4 + add dstq, strideq + dec hd + jg .s16_loop + RET +.h32: + mova m0, [tlq-64] + paddw m0, [tlq-48] + paddw m0, [tlq-32] + paddw m0, [tlq-16] + jmp wq +.w32: + movu m1, [tlq+ 2] + movu m2, [tlq+18] + paddw m1, m2 + movu m2, [tlq+34] + paddw m1, m2 + movu m2, [tlq+50] + paddw m1, m2 + paddw m0, m1 + pmaddwd m0, m3 + psubd m4, m0 + pshufd m0, m4, q1032 + paddd m0, m4 + pshuflw m4, m0, q1032 + paddd m0, m4 + psrld m0, m5 + cmp hd, 32 + je .w32_end + mov r6d, 0xAAAB + mov r2d, 0x6667 + cmp hd, 8 + cmove r6d, r2d + movd m1, r6d + pmulhuw m0, m1 + psrlw m0, 1 +.w32_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s32: + movd m1, alpham + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + pabsw m2, m1 + psllw m2, 9 +.s32_loop: + mova m4, [acq+16*0] + mova m5, [acq+16*1] + IPRED_CFL 3, 4 + IPRED_CFL 4, 5 + mova [dstq+16*0], m3 + mova [dstq+16*1], m4 + mova m4, [acq+16*2] + mova m5, [acq+16*3] + add acq, 16*4 + IPRED_CFL 3, 4 + IPRED_CFL 4, 5 + mova [dstq+16*2], m3 + mova [dstq+16*3], m4 + add dstq, strideq + dec hd + jg .s32_loop + RET + +cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac + tzcnt wd, wm + LEA t0, ipred_cfl_splat_16bpc_ssse3_table + mov r6d, r7m + movifnidn hd, hm + shr r6d, 11 + movd m7, r7m + movsxd wq, [t0+wq*4] + movddup m0, [t0-ipred_cfl_splat_16bpc_ssse3_table+pw_512+r6*8] + pshuflw m7, m7, q0000 + pxor m6, m6 + add wq, t0 + movifnidn acq, acmp + punpcklqdq m7, m7 + jmp wq + +cglobal ipred_cfl_ac_420_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h + movifnidn hpadd, hpadm +%if ARCH_X86_32 && PIC + pcmpeqw m5, m5 + pabsw m5, m5 + paddw m5, m5 +%else + movddup m5, [pw_2] +%endif + mov hd, hm + shl hpadd, 2 + pxor m4, m4 + sub hd, hpadd + cmp dword wm, 8 + mov r5, acq + jg .w16 + je .w8 + lea r3, [strideq*3] +.w4_loop: + pmaddwd m0, m5, [ypxq+strideq*0] + pmaddwd m1, m5, [ypxq+strideq*1] + pmaddwd m2, m5, [ypxq+strideq*2] + pmaddwd m3, m5, [ypxq+r3 ] + lea ypxq, [ypxq+strideq*4] + paddd m0, m1 + paddd m2, m3 + paddd m4, m0 + packssdw m0, m2 + paddd m4, m2 + mova [acq], m0 + add acq, 16 + sub hd, 2 + jg .w4_loop + test hpadd, hpadd + jz .dc + punpckhqdq m0, m0 + pslld m2, 2 +.w4_hpad: + mova [acq+16*0], m0 + paddd m4, m2 + mova [acq+16*1], m0 + add acq, 16*2 + sub hpadd, 4 + jg .w4_hpad + jmp .dc +.w8: +%if ARCH_X86_32 + cmp dword wpadm, 0 +%else + test wpadd, wpadd +%endif + jnz .w8_wpad1 +.w8_loop: + pmaddwd m0, m5, [ypxq+strideq*0+16*0] + pmaddwd m2, m5, [ypxq+strideq*1+16*0] + pmaddwd m1, m5, [ypxq+strideq*0+16*1] + pmaddwd m3, m5, [ypxq+strideq*1+16*1] + lea ypxq, [ypxq+strideq*2] + paddd m0, m2 + paddd m1, m3 + paddd m2, m0, m1 + packssdw m0, m1 + paddd m4, m2 + mova [acq], m0 + add acq, 16 + dec hd + jg .w8_loop +.w8_hpad: + test hpadd, hpadd + jz .dc + pslld m2, 2 + mova m1, m0 + jmp .hpad +.w8_wpad1: + pmaddwd m0, m5, [ypxq+strideq*0] + pmaddwd m1, m5, [ypxq+strideq*1] + lea ypxq, [ypxq+strideq*2] + paddd m0, m1 + pshufd m1, m0, q3333 + paddd m2, m0, m1 + packssdw m0, m1 + paddd m4, m2 + mova [acq], m0 + add acq, 16 + dec hd + jg .w8_wpad1 + jmp .w8_hpad +.w16_wpad3: + pshufd m3, m0, q3333 + mova m1, m3 + mova m2, m3 + jmp .w16_wpad_end +.w16_wpad2: + pshufd m1, m3, q3333 + mova m2, m1 + jmp 
.w16_wpad_end +.w16_wpad1: + pshufd m2, m1, q3333 + jmp .w16_wpad_end +.w16: + movifnidn wpadd, wpadm + WIN64_SPILL_XMM 7 +.w16_loop: + pmaddwd m0, m5, [ypxq+strideq*0+16*0] + pmaddwd m6, m5, [ypxq+strideq*1+16*0] + paddd m0, m6 + cmp wpadd, 2 + jg .w16_wpad3 + pmaddwd m3, m5, [ypxq+strideq*0+16*1] + pmaddwd m6, m5, [ypxq+strideq*1+16*1] + paddd m3, m6 + je .w16_wpad2 + pmaddwd m1, m5, [ypxq+strideq*0+16*2] + pmaddwd m6, m5, [ypxq+strideq*1+16*2] + paddd m1, m6 + jp .w16_wpad1 + pmaddwd m2, m5, [ypxq+strideq*0+16*3] + pmaddwd m6, m5, [ypxq+strideq*1+16*3] + paddd m2, m6 +.w16_wpad_end: + lea ypxq, [ypxq+strideq*2] + paddd m6, m0, m3 + packssdw m0, m3 + paddd m6, m1 + mova [acq+16*0], m0 + packssdw m1, m2 + paddd m2, m6 + mova [acq+16*1], m1 + add acq, 16*2 + paddd m4, m2 + dec hd + jg .w16_loop + WIN64_RESTORE_XMM + add hpadd, hpadd + jz .dc + paddd m2, m2 +.hpad: + mova [acq+16*0], m0 + mova [acq+16*1], m1 + paddd m4, m2 + mova [acq+16*2], m0 + mova [acq+16*3], m1 + add acq, 16*4 + sub hpadd, 4 + jg .hpad +.dc: + sub r5, acq ; -w*h*2 + pshufd m2, m4, q1032 + tzcnt r1d, r5d + paddd m2, m4 + sub r1d, 2 + pshufd m4, m2, q2301 + movd m0, r1d + paddd m2, m4 + psrld m2, m0 + pxor m0, m0 + pavgw m2, m0 + packssdw m2, m2 +.dc_loop: + mova m0, [acq+r5+16*0] + mova m1, [acq+r5+16*1] + psubw m0, m2 + psubw m1, m2 + mova [acq+r5+16*0], m0 + mova [acq+r5+16*1], m1 + add r5, 16*2 + jl .dc_loop + RET + +cglobal ipred_cfl_ac_422_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h + movifnidn hpadd, hpadm +%if ARCH_X86_32 && PIC + pcmpeqw m5, m5 + pabsw m5, m5 + psllw m5, 2 +%else + movddup m5, [pw_4] +%endif + mov hd, hm + shl hpadd, 2 + pxor m4, m4 + sub hd, hpadd + cmp dword wm, 8 + mov r5, acq + jg .w16 + je .w8 + lea r3, [strideq*3] +.w4_loop: + pmaddwd m0, m5, [ypxq+strideq*0] + pmaddwd m3, m5, [ypxq+strideq*1] + pmaddwd m1, m5, [ypxq+strideq*2] + pmaddwd m2, m5, [ypxq+r3 ] + lea ypxq, [ypxq+strideq*4] + paddd m4, m0 + packssdw m0, m3 + paddd m3, m1 + packssdw m1, m2 + paddd m4, m2 + paddd m4, m3 + mova [acq+16*0], m0 + mova [acq+16*1], m1 + add acq, 16*2 + sub hd, 4 + jg .w4_loop + test hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + punpckhqdq m1, m1 + pslld m2, 3 + mova [acq+16*0], m1 + mova [acq+16*1], m1 + paddd m4, m2 + mova [acq+16*2], m1 + mova [acq+16*3], m1 + add acq, 16*4 + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc +.w8: +%if ARCH_X86_32 + cmp dword wpadm, 0 +%else + test wpadd, wpadd +%endif + jnz .w8_wpad1 +.w8_loop: + pmaddwd m0, m5, [ypxq+strideq*0+16*0] + pmaddwd m2, m5, [ypxq+strideq*0+16*1] + pmaddwd m1, m5, [ypxq+strideq*1+16*0] + pmaddwd m3, m5, [ypxq+strideq*1+16*1] + lea ypxq, [ypxq+strideq*2] + paddd m4, m0 + packssdw m0, m2 + paddd m4, m2 + mova [acq+16*0], m0 + paddd m2, m1, m3 + packssdw m1, m3 + paddd m4, m2 + mova [acq+16*1], m1 + add acq, 16*2 + sub hd, 2 + jg .w8_loop +.w8_hpad: + test hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + pslld m2, 2 + mova m0, m1 + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad +.w8_wpad1: + pmaddwd m0, m5, [ypxq+strideq*0] + pmaddwd m1, m5, [ypxq+strideq*1] + lea ypxq, [ypxq+strideq*2] + pshufd m2, m0, q3333 + pshufd m3, m1, q3333 + paddd m4, m0 + packssdw m0, m2 + paddd m4, m2 + paddd m2, m1, m3 + packssdw m1, m3 + paddd m4, m2 + mova [acq+16*0], m0 + mova [acq+16*1], m1 + add acq, 16*2 + sub hd, 2 + jg .w8_wpad1 + jmp .w8_hpad +.w16_wpad3: + pshufd m3, m0, q3333 + mova m1, m3 + mova m2, m3 + jmp .w16_wpad_end +.w16_wpad2: + pshufd m1, m3, 
q3333 + mova m2, m1 + jmp .w16_wpad_end +.w16_wpad1: + pshufd m2, m1, q3333 + jmp .w16_wpad_end +.w16: + movifnidn wpadd, wpadm + WIN64_SPILL_XMM 7 +.w16_loop: + pmaddwd m0, m5, [ypxq+16*0] + cmp wpadd, 2 + jg .w16_wpad3 + pmaddwd m3, m5, [ypxq+16*1] + je .w16_wpad2 + pmaddwd m1, m5, [ypxq+16*2] + jp .w16_wpad1 + pmaddwd m2, m5, [ypxq+16*3] +.w16_wpad_end: + add ypxq, strideq + paddd m6, m0, m3 + packssdw m0, m3 + mova [acq+16*0], m0 + paddd m6, m1 + packssdw m1, m2 + paddd m2, m6 + mova [acq+16*1], m1 + add acq, 16*2 + paddd m4, m2 + dec hd + jg .w16_loop + WIN64_RESTORE_XMM + add hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + paddd m2, m2 + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad + +cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h +%define base r6-ipred_cfl_ac_444_16bpc_ssse3_table + LEA r6, ipred_cfl_ac_444_16bpc_ssse3_table + tzcnt wd, wm + movifnidn hpadd, hpadm + pxor m4, m4 + movsxd wq, [r6+wq*4] + movddup m5, [base+pw_1] + add wq, r6 + mov hd, hm + shl hpadd, 2 + sub hd, hpadd + jmp wq +.w4: + lea r3, [strideq*3] + mov r5, acq +.w4_loop: + movq m0, [ypxq+strideq*0] + movhps m0, [ypxq+strideq*1] + movq m1, [ypxq+strideq*2] + movhps m1, [ypxq+r3 ] + lea ypxq, [ypxq+strideq*4] + psllw m0, 3 + psllw m1, 3 + mova [acq+16*0], m0 + pmaddwd m0, m5 + mova [acq+16*1], m1 + pmaddwd m2, m5, m1 + add acq, 16*2 + paddd m4, m0 + paddd m4, m2 + sub hd, 4 + jg .w4_loop + test hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + punpckhqdq m1, m1 + mova [acq+16*0], m1 + pslld m2, 2 + mova [acq+16*1], m1 + punpckhqdq m2, m2 + mova [acq+16*2], m1 + paddd m4, m2 + mova [acq+16*3], m1 + add acq, 16*4 + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc +.w8: + mov r5, acq +.w8_loop: + mova m0, [ypxq+strideq*0] + mova m1, [ypxq+strideq*1] + lea ypxq, [ypxq+strideq*2] + psllw m0, 3 + psllw m1, 3 + mova [acq+16*0], m0 + pmaddwd m0, m5 + mova [acq+16*1], m1 + pmaddwd m2, m5, m1 + add acq, 16*2 + paddd m4, m0 + paddd m4, m2 + sub hd, 2 + jg .w8_loop +.w8_hpad: + test hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + pslld m2, 2 + mova m0, m1 + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad +.w16_wpad2: + pshufhw m3, m2, q3333 + pshufhw m1, m0, q3333 + punpckhqdq m3, m3 + punpckhqdq m1, m1 + jmp .w16_wpad_end +.w16: + movifnidn wpadd, wpadm + mov r5, acq +.w16_loop: + mova m2, [ypxq+strideq*0+16*0] + mova m0, [ypxq+strideq*1+16*0] + psllw m2, 3 + psllw m0, 3 + test wpadd, wpadd + jnz .w16_wpad2 + mova m3, [ypxq+strideq*0+16*1] + mova m1, [ypxq+strideq*1+16*1] + psllw m3, 3 + psllw m1, 3 +.w16_wpad_end: + lea ypxq, [ypxq+strideq*2] + mova [acq+16*0], m2 + pmaddwd m2, m5 + mova [acq+16*1], m3 + pmaddwd m3, m5 + paddd m4, m2 + pmaddwd m2, m5, m0 + mova [acq+16*2], m0 + paddd m4, m3 + pmaddwd m3, m5, m1 + mova [acq+16*3], m1 + add acq, 16*4 + paddd m2, m3 + paddd m4, m2 + sub hd, 2 + jg .w16_loop + add hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + paddd m2, m2 + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad +.w32_wpad6: + pshufhw m1, m0, q3333 + punpckhqdq m1, m1 + mova m2, m1 + mova m3, m1 + jmp .w32_wpad_end +.w32_wpad4: + pshufhw m2, m1, q3333 + punpckhqdq m2, m2 + mova m3, m2 + jmp .w32_wpad_end +.w32_wpad2: + pshufhw m3, m2, q3333 + punpckhqdq m3, m3 + jmp .w32_wpad_end +.w32: + movifnidn wpadd, wpadm + mov r5, acq + WIN64_SPILL_XMM 8 +.w32_loop: + mova m0, [ypxq+16*0] + psllw m0, 3 + cmp 
wpadd, 4 + jg .w32_wpad6 + mova m1, [ypxq+16*1] + psllw m1, 3 + je .w32_wpad4 + mova m2, [ypxq+16*2] + psllw m2, 3 + jnp .w32_wpad2 + mova m3, [ypxq+16*3] + psllw m3, 3 +.w32_wpad_end: + add ypxq, strideq + pmaddwd m6, m5, m0 + mova [acq+16*0], m0 + pmaddwd m7, m5, m1 + mova [acq+16*1], m1 + paddd m6, m7 + pmaddwd m7, m5, m2 + mova [acq+16*2], m2 + paddd m6, m7 + pmaddwd m7, m5, m3 + mova [acq+16*3], m3 + add acq, 16*4 + paddd m6, m7 + paddd m4, m6 + dec hd + jg .w32_loop +%if WIN64 + mova m5, m6 + WIN64_RESTORE_XMM + SWAP 5, 6 +%endif + test hpadd, hpadd + jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc +.w32_hpad_loop: + mova [acq+16*0], m0 + mova [acq+16*1], m1 + paddd m4, m6 + mova [acq+16*2], m2 + mova [acq+16*3], m3 + add acq, 16*4 + dec hpadd + jg .w32_hpad_loop + jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc + +cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h +%define base r2-pal_pred_16bpc_ssse3_table +%if ARCH_X86_32 + %define hd r2d +%endif + mova m3, [palq] + LEA r2, pal_pred_16bpc_ssse3_table + tzcnt wd, wm + pshufb m3, [base+pal_pred_shuf] + movsxd wq, [r2+wq*4] + pshufd m4, m3, q1032 + add wq, r2 + movifnidn hd, hm + jmp wq +.w4: + mova m0, [idxq] + add idxq, 16 + pshufb m1, m3, m0 + pshufb m2, m4, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 4 + jg .w4 + RET +.w8: + mova m0, [idxq] + add idxq, 16 + pshufb m1, m3, m0 + pshufb m2, m4, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8 + RET +.w16: + mova m0, [idxq] + add idxq, 16 + pshufb m1, m3, m0 + pshufb m2, m4, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + add dstq, strideq + dec hd + jg .w16 + RET +.w32: + mova m0, [idxq+16*0] + pshufb m1, m3, m0 + pshufb m2, m4, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova m2, [idxq+16*1] + add idxq, 16*2 + mova [dstq+16*0], m0 + pshufb m0, m3, m2 + mova [dstq+16*1], m1 + pshufb m1, m4, m2 + punpcklbw m2, m0, m1 + punpckhbw m0, m1 + mova [dstq+16*2], m2 + mova [dstq+16*3], m0 + add dstq, strideq + dec hd + jg .w32 + RET +.w64: + mova m0, [idxq+16*0] + pshufb m1, m3, m0 + pshufb m2, m4, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova m2, [idxq+16*1] + mova [dstq+16*0], m0 + pshufb m0, m3, m2 + mova [dstq+16*1], m1 + pshufb m1, m4, m2 + punpcklbw m2, m0, m1 + punpckhbw m0, m1 + mova m1, [idxq+16*2] + mova [dstq+16*2], m2 + pshufb m2, m3, m1 + mova [dstq+16*3], m0 + pshufb m0, m4, m1 + punpcklbw m1, m2, m0 + punpckhbw m2, m0 + mova m0, [idxq+16*3] + add idxq, 16*4 + mova [dstq+16*4], m1 + pshufb m1, m3, m0 + mova [dstq+16*5], m2 + pshufb m2, m4, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+16*6], m0 + mova [dstq+16*7], m1 + add dstq, strideq + dec hd + jg .w64 + RET diff --git a/third_party/dav1d/src/x86/ipred_avx2.asm b/third_party/dav1d/src/x86/ipred_avx2.asm new file mode 100644 index 0000000000..dd188a7f37 --- /dev/null +++ b/third_party/dav1d/src/x86/ipred_avx2.asm @@ -0,0 +1,5387 @@ +; Copyright © 2018-2021, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. 
Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +%macro SMOOTH_WEIGHT_TABLE 1-* + %rep %0 + db %1-128, 127-%1 + %rotate 1 + %endrep +%endmacro + +; sm_weights[], but modified to precalculate x and 256-x with offsets to +; enable efficient use of pmaddubsw (which requires signed values) +smooth_weights: SMOOTH_WEIGHT_TABLE \ + 0, 0, 255, 128, 255, 149, 85, 64, \ + 255, 197, 146, 105, 73, 50, 37, 32, \ + 255, 225, 196, 170, 145, 123, 102, 84, \ + 68, 54, 43, 33, 26, 20, 17, 16, \ + 255, 240, 225, 210, 196, 182, 169, 157, \ + 145, 133, 122, 111, 101, 92, 83, 74, \ + 66, 59, 52, 45, 39, 34, 29, 25, \ + 21, 17, 14, 12, 10, 9, 8, 8, \ + 255, 248, 240, 233, 225, 218, 210, 203, \ + 196, 189, 182, 176, 169, 163, 156, 150, \ + 144, 138, 133, 127, 121, 116, 111, 106, \ + 101, 96, 91, 86, 82, 77, 73, 69, \ + 65, 61, 57, 54, 50, 47, 44, 41, \ + 38, 35, 32, 29, 27, 25, 22, 20, \ + 18, 16, 15, 13, 12, 10, 9, 8, \ + 7, 6, 6, 5, 5, 4, 4, 4 + +pb_1to32: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + db 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 +pb_32to1: db 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17 +pb_16to1: db 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 +z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 + db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1 +z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16 + db 32, 16, 32, 16, 24, 20, 24, 20, 16, 16, 16, 16 + db 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 8, 0 +z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7 + db 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15 + db 15, 15, 15, 15, 15, 15, 15, 15 ; should be in one cache line +pb_128: times 4 db 128 ; those are just placed here for alignment. 
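+; Worked arithmetic for the smooth_weights encoding above: for a weight w +; and pixels a/b, pmaddubsw on the stored (w-128, 127-w) byte pair yields +; (w-128)*a + (127-w)*b, and adding a precomputed 128*a + 129*b word (see +; the SMOOTH macro below) restores the full blend w*a + (256-w)*b.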
+pb_36_m4: times 2 db 36, -4 +z3_shuf: db 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0 +z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0 +z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0 +z_upsample1: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 +z_upsample2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8 +z2_upsample: db 7, 6, 15, 14, 5, 4, 13, 12, 3, 2, 11, 10, 1, 0, 9, 8 +z1_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 +z2_shuf_h2: db 3, 2, 7, 6, 11, 10, 15, 14, 2, 1, 6, 5, 10, 9, 14, 13 +z2_shuf_h4: db 7, 6, 15, 14, 6, 5, 14, 13, 5, 4, 13, 12, 4, 3, 12, 11 +z3_shuf_w4: db 4, 3, 3, 2, 2, 1, 1, 0, 12, 11, 11, 10, 10, 9, 9, 8 +z_transpose4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 +z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 + dw 16*64, 17*64, 18*64, 19*64, 20*64, 21*64, 22*64, 23*64 +z2_base_inc: dw 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64, 8*64 + dw 9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64, 16*64 +z2_ymul: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +z2_y_shuf_h4: db 90, 90, 90, 90, 14, 14, 14, 14, 27, 27, 27, 27, 31, 31, 31, 31 ; 2, 6, 3, 7 + db 32, 32, 32, 32, 12, 12, 12, 12, 1, 0, 1, 0, 5, -1, -1, -1 ; 0, 4, 1, 5 +; vpermd indices in bits 4..6 of filter_shuf1: 0, 2, 6, 4, 1, 3, 7, 5 +filter_shuf1: db 10, 4, 10, 4, 37, 6, 5, 6,103, 9, 7, 9, 72, -1, 8, -1 + db 16, 4, 0, 4, 53, 6, 5, 6,119, 11, 7, 11, 95, -1, 15, -1 +filter_shuf2: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1 +filter_shuf3: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11; 15, -1, 15, -1 +pb_127_m127: times 2 db 127, -127 +ipred_v_shuf: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13 + db 2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15 +ipred_h_shuf: db 7, 7, 7, 7, 3, 3, 3, 3, 5, 5, 5, 5, 1, 1, 1, 1 + db 6, 6, 6, 6, 2, 2, 2, 2, 4, 4, 4, 4; 0, 0, 0, 0 +pw_64: times 2 dw 64 + +cfl_ac_444_w16_pad1_shuffle: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1 + times 9 db 7, -1 +cfl_ac_w16_pad_shuffle: ; w=16, w_pad=1 + db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + ; w=8, w_pad=1 as well as second half of previous one +cfl_ac_w8_pad1_shuffle: db 0, 1, 2, 3, 4, 5 + times 5 db 6, 7 + ; w=16,w_pad=2 + db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + times 8 db 14, 15 + ; w=16,w_pad=3 + db 0, 1, 2, 3, 4, 5 + times 13 db 6, 7 +pb_15to0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +%define pb_0to15 cfl_ac_w16_pad_shuffle +%define pb_1 (ipred_h_shuf+12) +%define pb_2 (ipred_h_shuf+20) +%define pb_3 (ipred_h_shuf+ 4) +%define pb_4 (ipred_h_shuf+24) +%define pb_5 (ipred_h_shuf+ 8) +%define pb_7 (ipred_h_shuf+ 0) +%define pb_8 (z_upsample2 +12) +%define pb_12 (z2_y_shuf_h4+20) +%define pb_14 (z2_y_shuf_h4+ 4) +%define pb_15 (z_filter_s +32) +%define pb_27 (z2_y_shuf_h4+ 8) +%define pb_31 (z2_y_shuf_h4+12) +%define pb_32 (z2_y_shuf_h4+16) +%define pb_90 (z2_y_shuf_h4+ 0) +%define pw_1 (z2_y_shuf_h4+24) +%define pw_8 (z_filter_k +32) + +pw_62: times 2 dw 62 +pw_128: times 2 dw 128 +pw_255: times 2 dw 255 +pw_512: times 2 dw 512 + +%macro JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - 2*4) + %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2) + %%table: + %rep %0 - 2 + dd %%base %+ .%3 - (%%table - 2*4) + %rotate 1 + %endrep +%endmacro + +%define ipred_dc_splat_avx2_table (ipred_dc_avx2_table + 10*4) +%define ipred_cfl_splat_avx2_table (ipred_cfl_avx2_table + 8*4) + +JMP_TABLE ipred_smooth, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_v, avx2, w4, w8, w16, w32, 
w64 +JMP_TABLE ipred_smooth_h, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_paeth, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_filter, avx2, w4, w8, w16, w32 +JMP_TABLE ipred_dc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ + s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 +JMP_TABLE ipred_dc_left, avx2, h4, h8, h16, h32, h64 +JMP_TABLE ipred_h, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z1, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z2, avx2, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z3, avx2, h4, h8, h16, h32, h64 +JMP_TABLE ipred_cfl, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \ + s4-8*4, s8-8*4, s16-8*4, s32-8*4 +JMP_TABLE ipred_cfl_left, avx2, h4, h8, h16, h32 +JMP_TABLE ipred_cfl_ac_420, avx2, w16_pad1, w16_pad2, w16_pad3 +JMP_TABLE ipred_cfl_ac_422, avx2, w16_pad1, w16_pad2, w16_pad3 +JMP_TABLE ipred_cfl_ac_444, avx2, w32_pad1, w32_pad2, w32_pad3, w4, w8, w16, w32 +JMP_TABLE pal_pred, avx2, w4, w8, w16, w32, w64 + +cextern dr_intra_derivative +cextern filter_intra_taps + +SECTION .text + +INIT_YMM avx2 +cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h + lea r5, [ipred_dc_left_avx2_table] + tzcnt wd, wm + inc tlq + movu m0, [tlq] + movifnidn hd, hm + mov r6d, 0x8000 + shrx r6d, r6d, wd + movd xm3, r6d + movsxd r6, [r5+wq*4] + pcmpeqd m2, m2 + pmaddubsw m0, m2 + add r6, r5 + add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 + +cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 + mov hd, hm ; zero upper half + tzcnt r6d, hd + sub tlq, hq + tzcnt wd, wm + movu m0, [tlq] + mov r5d, 0x8000 + shrx r5d, r5d, r6d + movd xm3, r5d + lea r5, [ipred_dc_left_avx2_table] + movsxd r6, [r5+r6*4] + pcmpeqd m2, m2 + pmaddubsw m0, m2 + add r6, r5 + add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 +.h64: + movu m1, [tlq+32] ; unaligned when jumping here from dc_top + pmaddubsw m1, m2 + paddw m0, m1 +.h32: + vextracti128 xm1, m0, 1 + paddw xm0, xm1 +.h16: + punpckhqdq xm1, xm0, xm0 + paddw xm0, xm1 +.h8: + psrlq xm1, xm0, 32 + paddw xm0, xm1 +.h4: + pmaddwd xm0, xm2 + pmulhrsw xm0, xm3 + lea stride3q, [strideq*3] + vpbroadcastb m0, xm0 + mova m1, m0 + jmp wq + +cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 + movifnidn hd, hm + movifnidn wd, wm + tzcnt r6d, hd + lea r5d, [wq+hq] + movd xm4, r5d + tzcnt r5d, r5d + movd xm5, r5d + lea r5, [ipred_dc_avx2_table] + tzcnt wd, wd + movsxd r6, [r5+r6*4] + movsxd wq, [r5+wq*4+5*4] + pcmpeqd m3, m3 + psrlw xm4, 1 + add r6, r5 + add wq, r5 + lea stride3q, [strideq*3] + jmp r6 +.h4: + movd xm0, [tlq-4] + pmaddubsw xm0, xm3 + jmp wq +.w4: + movd xm1, [tlq+1] + pmaddubsw xm1, xm3 + psubw xm0, xm4 + paddw xm0, xm1 + pmaddwd xm0, xm3 + cmp hd, 4 + jg .w4_mul + psrlw xm0, 3 + jmp .w4_end +.w4_mul: + punpckhqdq xm1, xm0, xm0 + lea r2d, [hq*2] + mov r6d, 0x55563334 + paddw xm0, xm1 + shrx r6d, r6d, r2d + psrlq xm1, xm0, 32 + paddw xm0, xm1 + movd xm1, r6d + psrlw xm0, 2 + pmulhuw xm0, xm1 +.w4_end: + vpbroadcastb xm0, xm0 +.s4: + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xm0 + movd [dstq+strideq*2], xm0 + movd [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s4 + RET +ALIGN function_align +.h8: + movq xm0, [tlq-8] + pmaddubsw xm0, xm3 + jmp wq +.w8: + movq xm1, [tlq+1] + vextracti128 xm2, m0, 1 + pmaddubsw xm1, xm3 + psubw xm0, xm4 + paddw xm0, xm2 + punpckhqdq xm2, xm0, xm0 + paddw xm0, xm2 + paddw xm0, xm1 + psrlq xm1, xm0, 32 + paddw xm0, xm1 + pmaddwd xm0, xm3 + psrlw xm0, xm5 + cmp 
hd, 8 + je .w8_end + mov r6d, 0x5556 + mov r2d, 0x3334 + cmp hd, 32 + cmove r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 +.w8_end: + vpbroadcastb xm0, xm0 +.s8: + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm0 + movq [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s8 + RET +ALIGN function_align +.h16: + mova xm0, [tlq-16] + pmaddubsw xm0, xm3 + jmp wq +.w16: + movu xm1, [tlq+1] + vextracti128 xm2, m0, 1 + pmaddubsw xm1, xm3 + psubw xm0, xm4 + paddw xm0, xm2 + paddw xm0, xm1 + punpckhqdq xm1, xm0, xm0 + paddw xm0, xm1 + psrlq xm1, xm0, 32 + paddw xm0, xm1 + pmaddwd xm0, xm3 + psrlw xm0, xm5 + cmp hd, 16 + je .w16_end + mov r6d, 0x5556 + mov r2d, 0x3334 + test hb, 8|32 + cmovz r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 +.w16_end: + vpbroadcastb xm0, xm0 +.s16: + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm0 + mova [dstq+strideq*2], xm0 + mova [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s16 + RET +ALIGN function_align +.h32: + mova m0, [tlq-32] + pmaddubsw m0, m3 + jmp wq +.w32: + movu m1, [tlq+1] + pmaddubsw m1, m3 + paddw m0, m1 + vextracti128 xm1, m0, 1 + psubw xm0, xm4 + paddw xm0, xm1 + punpckhqdq xm1, xm0, xm0 + paddw xm0, xm1 + psrlq xm1, xm0, 32 + paddw xm0, xm1 + pmaddwd xm0, xm3 + psrlw xm0, xm5 + cmp hd, 32 + je .w32_end + lea r2d, [hq*2] + mov r6d, 0x33345556 + shrx r6d, r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 +.w32_end: + vpbroadcastb m0, xm0 +.s32: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s32 + RET +ALIGN function_align +.h64: + mova m0, [tlq-64] + mova m1, [tlq-32] + pmaddubsw m0, m3 + pmaddubsw m1, m3 + paddw m0, m1 + jmp wq +.w64: + movu m1, [tlq+ 1] + movu m2, [tlq+33] + pmaddubsw m1, m3 + pmaddubsw m2, m3 + paddw m0, m1 + paddw m0, m2 + vextracti128 xm1, m0, 1 + psubw xm0, xm4 + paddw xm0, xm1 + punpckhqdq xm1, xm0, xm0 + paddw xm0, xm1 + psrlq xm1, xm0, 32 + paddw xm0, xm1 + pmaddwd xm0, xm3 + psrlw xm0, xm5 + cmp hd, 64 + je .w64_end + mov r6d, 0x33345556 + shrx r6d, r6d, hd + movd xm1, r6d + pmulhuw xm0, xm1 +.w64_end: + vpbroadcastb m0, xm0 + mova m1, m0 +.s64: + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*1+32*0], m0 + mova [dstq+strideq*1+32*1], m1 + mova [dstq+strideq*2+32*0], m0 + mova [dstq+strideq*2+32*1], m1 + mova [dstq+stride3q +32*0], m0 + mova [dstq+stride3q +32*1], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s64 + RET + +cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 + lea r5, [ipred_dc_splat_avx2_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r5+wq*4] + vpbroadcastd m0, [r5-ipred_dc_splat_avx2_table+pb_128] + mova m1, m0 + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 + lea r5, [ipred_dc_splat_avx2_table] + tzcnt wd, wm + movu m0, [tlq+ 1] + movu m1, [tlq+33] + movifnidn hd, hm + movsxd wq, [r5+wq*4] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +%macro IPRED_H 2 ; w, store_type + vpbroadcastb m0, [tlq-1] + vpbroadcastb m1, [tlq-2] + vpbroadcastb m2, [tlq-3] + sub tlq, 4 + vpbroadcastb m3, [tlq+0] + mov%2 [dstq+strideq*0], m0 + mov%2 [dstq+strideq*1], m1 + mov%2 [dstq+strideq*2], m2 + mov%2 [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w%1 + RET +ALIGN function_align +%endmacro + +INIT_XMM avx2 +cglobal ipred_h_8bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 + lea r5, 
[ipred_h_avx2_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r5+wq*4] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq +.w4: + IPRED_H 4, d +.w8: + IPRED_H 8, q +.w16: + IPRED_H 16, a +INIT_YMM avx2 +.w32: + IPRED_H 32, a +.w64: + vpbroadcastb m0, [tlq-1] + vpbroadcastb m1, [tlq-2] + vpbroadcastb m2, [tlq-3] + sub tlq, 4 + vpbroadcastb m3, [tlq+0] + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m0 + mova [dstq+strideq*1+32*0], m1 + mova [dstq+strideq*1+32*1], m1 + mova [dstq+strideq*2+32*0], m2 + mova [dstq+strideq*2+32*1], m2 + mova [dstq+stride3q +32*0], m3 + mova [dstq+stride3q +32*1], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w64 + RET + +%macro PAETH 2 ; top, ldiff + pavgb m1, m%1, m3 ; Calculating tldiff normally requires + pxor m0, m%1, m3 ; 10-bit intermediates, but we can do it + pand m0, m4 ; in 8-bit with some tricks which avoids + psubusb m2, m5, m1 ; having to unpack everything to 16-bit. + psubb m1, m0 + psubusb m1, m5 + por m1, m2 + paddusb m1, m1 + por m1, m0 ; min(tldiff, 255) + psubusb m2, m5, m3 + psubusb m0, m3, m5 + por m2, m0 ; tdiff + pminub m2, m%2 + pcmpeqb m0, m%2, m2 ; ldiff <= tdiff + vpblendvb m0, m%1, m3, m0 + pminub m1, m2 + pcmpeqb m1, m2 ; ldiff <= tldiff || tdiff <= tldiff + vpblendvb m0, m5, m0, m1 +%endmacro + +cglobal ipred_paeth_8bpc, 3, 6, 9, dst, stride, tl, w, h +%define base r5-ipred_paeth_avx2_table + lea r5, [ipred_paeth_avx2_table] + tzcnt wd, wm + vpbroadcastb m5, [tlq] ; topleft + movifnidn hd, hm + movsxd wq, [r5+wq*4] + vpbroadcastd m4, [base+pb_1] + add wq, r5 + jmp wq +.w4: + vpbroadcastd m6, [tlq+1] ; top + mova m8, [base+ipred_h_shuf] + lea r3, [strideq*3] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 ; ldiff +.w4_loop: + sub tlq, 8 + vpbroadcastq m3, [tlq] + pshufb m3, m8 ; left + PAETH 6, 7 + vextracti128 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xm1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+r3 ], xm1, 2 + cmp hd, 4 + je .ret + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 1 + pextrd [dstq+strideq*1], xm1, 1 + pextrd [dstq+strideq*2], xm0, 3 + pextrd [dstq+r3 ], xm1, 3 + lea dstq, [dstq+strideq*4] + sub hd, 8 + jg .w4_loop +.ret: + RET +ALIGN function_align +.w8: + vpbroadcastq m6, [tlq+1] + mova m8, [base+ipred_h_shuf] + lea r3, [strideq*3] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 +.w8_loop: + sub tlq, 4 + vpbroadcastd m3, [tlq] + pshufb m3, m8 + PAETH 6, 7 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+r3 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_loop + RET +ALIGN function_align +.w16: + vbroadcasti128 m6, [tlq+1] + mova xm8, xm4 ; lower half = 1, upper half = 0 + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 +.w16_loop: + sub tlq, 2 + vpbroadcastd m3, [tlq] + pshufb m3, m8 + PAETH 6, 7 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_loop + RET +ALIGN function_align +.w32: + movu m6, [tlq+1] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 +.w32_loop: + dec tlq + vpbroadcastb m3, [tlq] + PAETH 6, 7 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w32_loop + RET +ALIGN function_align +.w64: + movu m6, [tlq+ 1] + movu m7, [tlq+33] +%if WIN64 + movaps r4m, xmm9 +%endif + psubusb m8, m5, m6 + psubusb m0, m6, m5 + psubusb m9, m5, m7 + psubusb m1, m7, m5 + por m8, m0 + por m9, m1 +.w64_loop: + dec tlq + vpbroadcastb m3, [tlq] + PAETH 6, 8 + mova [dstq+32*0], m0 
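+ ; second 32-byte half: top in m7, its precomputed ldiff in m9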
+ PAETH 7, 9 + mova [dstq+32*1], m0 + add dstq, strideq + dec hd + jg .w64_loop +%if WIN64 + movaps xmm9, r4m +%endif + RET + +%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2] + ; w * a = (w - 128) * a + 128 * a + ; (256 - w) * b = (127 - w) * b + 129 * b + pmaddubsw m0, m%3, m%1 + pmaddubsw m1, m%4, m%2 + paddw m0, m%5 + paddw m1, m%6 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 +%endmacro + +cglobal ipred_smooth_v_8bpc, 3, 7, 0, dst, stride, tl, w, h, weights +%define base r6-ipred_smooth_v_avx2_table + lea r6, [ipred_smooth_v_avx2_table] + tzcnt wd, wm + mov hd, hm + movsxd wq, [r6+wq*4] + vpbroadcastd m0, [base+pb_127_m127] + vpbroadcastd m1, [base+pw_128] + lea weightsq, [base+smooth_weights+hq*4] + neg hq + vpbroadcastb m5, [tlq+hq] ; bottom + add wq, r6 + jmp wq +.w4: + vpbroadcastd m2, [tlq+1] + punpcklbw m2, m5 ; top, bottom + mova m5, [base+ipred_v_shuf] + lea r3, [strideq*3] + punpckldq m4, m5, m5 + punpckhdq m5, m5 + pmaddubsw m3, m2, m0 + paddw m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok + paddw m3, m1 ; 128 * top + 129 * bottom + 128 +.w4_loop: + vbroadcasti128 m1, [weightsq+hq*2] + pshufb m0, m1, m4 + pshufb m1, m5 + SMOOTH 0, 1, 2, 2, 3, 3 + vextracti128 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xm1 + pextrd [dstq+strideq*2], xm0, 1 + pextrd [dstq+r3 ], xm1, 1 + cmp hd, -4 + je .ret + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm1, 2 + pextrd [dstq+strideq*2], xm0, 3 + pextrd [dstq+r3 ], xm1, 3 + lea dstq, [dstq+strideq*4] + add hq, 8 + jl .w4_loop +.ret: + RET +ALIGN function_align +.w8: + vpbroadcastq m2, [tlq+1] + punpcklbw m2, m5 + mova m5, [base+ipred_v_shuf] + lea r3, [strideq*3] + pshufd m4, m5, q0000 + pshufd m5, m5, q1111 + pmaddubsw m3, m2, m0 + paddw m1, m2 + paddw m3, m1 +.w8_loop: + vpbroadcastq m1, [weightsq+hq*2] + pshufb m0, m1, m4 + pshufb m1, m5 + SMOOTH 0, 1, 2, 2, 3, 3 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+r3 ], xm1 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w8_loop + RET +ALIGN function_align +.w16: + WIN64_SPILL_XMM 7 + vbroadcasti128 m3, [tlq+1] + mova m6, [base+ipred_v_shuf] + punpcklbw m2, m3, m5 + punpckhbw m3, m5 + pmaddubsw m4, m2, m0 + pmaddubsw m5, m3, m0 + paddw m0, m1, m2 + paddw m1, m3 + paddw m4, m0 + paddw m5, m1 +.w16_loop: + vpbroadcastd m1, [weightsq+hq*2] + pshufb m1, m6 + SMOOTH 1, 1, 2, 3, 4, 5 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + add hq, 2 + jl .w16_loop + RET +ALIGN function_align +.w32: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 6 + movu m3, [tlq+1] + punpcklbw m2, m3, m5 + punpckhbw m3, m5 + pmaddubsw m4, m2, m0 + pmaddubsw m5, m3, m0 + paddw m0, m1, m2 + paddw m1, m3 + paddw m4, m0 + paddw m5, m1 +.w32_loop: + vpbroadcastw m1, [weightsq+hq*2] + SMOOTH 1, 1, 2, 3, 4, 5 + mova [dstq], m0 + add dstq, strideq + inc hq + jl .w32_loop + RET +ALIGN function_align +.w64: + WIN64_SPILL_XMM 11 + movu m4, [tlq+ 1] + movu m8, [tlq+33] + punpcklbw m3, m4, m5 + punpckhbw m4, m5 + punpcklbw m7, m8, m5 + punpckhbw m8, m5 + pmaddubsw m5, m3, m0 + pmaddubsw m6, m4, m0 + pmaddubsw m9, m7, m0 + pmaddubsw m10, m8, m0 + paddw m2, m1, m3 + paddw m5, m2 + paddw m2, m1, m4 + paddw m6, m2 + paddw m0, m1, m7 + paddw m9, m0 + paddw m1, m8 + paddw m10, m1 +.w64_loop: + vpbroadcastw m2, [weightsq+hq*2] + SMOOTH 2, 2, 3, 4, 5, 6 + mova [dstq+32*0], m0 + SMOOTH 2, 2, 7, 8, 9, 10 + mova [dstq+32*1], 
m0 + add dstq, strideq + inc hq + jl .w64_loop + RET + +%macro SETUP_STACK_FRAME 3 ; stack_size, regs_used, xmm_regs_used + %assign stack_offset 0 + %assign stack_size_padded 0 + %assign regs_used %2 + %xdefine rstk rsp + SETUP_STACK_POINTER %1 + %if regs_used != %2 && WIN64 + PUSH r%2 + %endif + ALLOC_STACK %1, %3 +%endmacro + +cglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl, w, h +%define base r6-ipred_smooth_h_avx2_table + lea r6, [ipred_smooth_h_avx2_table] + mov wd, wm + vpbroadcastb m3, [tlq+wq] ; right + tzcnt wd, wd + mov hd, hm + movsxd wq, [r6+wq*4] + vpbroadcastd m4, [base+pb_127_m127] + vpbroadcastd m5, [base+pw_128] + add wq, r6 + jmp wq +.w4: + WIN64_SPILL_XMM 8 + vpbroadcastq m6, [base+smooth_weights+4*2] + mova m7, [base+ipred_h_shuf] + sub tlq, 8 + sub tlq, hq + lea r3, [strideq*3] +.w4_loop: + vpbroadcastq m2, [tlq+hq] + pshufb m2, m7 + punpcklbw m1, m2, m3 ; left, right + punpckhbw m2, m3 + pmaddubsw m0, m1, m4 ; 127 * left - 127 * right + paddw m0, m1 ; 128 * left + 129 * right + pmaddubsw m1, m6 + paddw m1, m5 + paddw m0, m1 + pmaddubsw m1, m2, m4 + paddw m1, m2 + pmaddubsw m2, m6 + paddw m2, m5 + paddw m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + vextracti128 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xm1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+r3 ], xm1, 2 + cmp hd, 4 + je .ret + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 1 + pextrd [dstq+strideq*1], xm1, 1 + pextrd [dstq+strideq*2], xm0, 3 + pextrd [dstq+r3 ], xm1, 3 + lea dstq, [dstq+strideq*4] + sub hd, 8 + jg .w4_loop +.ret: + RET +ALIGN function_align +.w8: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 8 + vbroadcasti128 m6, [base+smooth_weights+8*2] + mova m7, [base+ipred_h_shuf] + sub tlq, 4 + lea r3, [strideq*3] + sub tlq, hq +.w8_loop: + vpbroadcastd m2, [tlq+hq] + pshufb m2, m7 + punpcklbw m1, m2, m3 + punpckhbw m2, m3 + pmaddubsw m0, m1, m4 + paddw m0, m1 + pmaddubsw m1, m6 + paddw m1, m5 + paddw m0, m1 + pmaddubsw m1, m2, m4 + paddw m1, m2 + pmaddubsw m2, m6 + paddw m2, m5 + paddw m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+r3 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_loop + RET +ALIGN function_align +.w16: + SETUP_STACK_FRAME 32*4, 7, 8 + lea r3, [rsp+64*2-4] + call .prep ; only worthwhile for w16 and above + sub tlq, 2 + vpbroadcastd xm6, [base+pb_1] + mova xm7, [base+ipred_v_shuf+16] + vinserti128 m7, [base+ipred_v_shuf+ 0], 1 + vbroadcasti128 m4, [base+smooth_weights+16*2] + vbroadcasti128 m5, [base+smooth_weights+16*3] +.w16_loop: + vpbroadcastd m1, [tlq+hq] + vpbroadcastd m2, [r3+hq*2] + pshufb m1, m6 + punpcklbw m1, m3 + pshufb m2, m7 + SMOOTH 4, 5, 1, 1, 2, 2 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_loop + RET +ALIGN function_align +.w32: + SETUP_STACK_FRAME 32*4, 7, 6 + lea r3, [rsp+64*2-2] + call .prep + dec tlq + mova xm4, [base+smooth_weights+16*4] + vinserti128 m4, [base+smooth_weights+16*6], 1 + mova xm5, [base+smooth_weights+16*5] + vinserti128 m5, [base+smooth_weights+16*7], 1 +.w32_loop: + vpbroadcastb m1, [tlq+hq] + punpcklbw m1, m3 + vpbroadcastw m2, [r3+hq*2] + SMOOTH 4, 5, 1, 1, 2, 2 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w32_loop + RET +ALIGN function_align +.w64: + SETUP_STACK_FRAME 32*4, 7, 9 + lea r3, [rsp+64*2-2] + call .prep + add r6,
smooth_weights+16*15-ipred_smooth_h_avx2_table + dec tlq + mova xm5, [r6-16*7] + vinserti128 m5, [r6-16*5], 1 + mova xm6, [r6-16*6] + vinserti128 m6, [r6-16*4], 1 + mova xm7, [r6-16*3] + vinserti128 m7, [r6-16*1], 1 + mova xm8, [r6-16*2] + vinserti128 m8, [r6-16*0], 1 +.w64_loop: + vpbroadcastb m2, [tlq+hq] + punpcklbw m2, m3 + vpbroadcastw m4, [r3+hq*2] + SMOOTH 5, 6, 2, 2, 4, 4 + mova [dstq+32*0], m0 + SMOOTH 7, 8, 2, 2, 4, 4 + mova [dstq+32*1], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET +ALIGN function_align +.prep: + vpermq m2, [tlq-32*1], q3120 + punpckhbw m1, m2, m3 + punpcklbw m2, m3 + pmaddubsw m0, m1, m4 ; 127 * left - 127 * right + paddw m1, m5 ; 1 * left + 256 * right + 128 + paddw m0, m1 ; 128 * left + 129 * right + 128 + pmaddubsw m1, m2, m4 + paddw m2, m5 + paddw m1, m2 + vpermq m2, [tlq-32*2], q3120 + mova [rsp+gprsize+32*3], m0 + mova [rsp+gprsize+32*2], m1 + punpckhbw m1, m2, m3 + punpcklbw m2, m3 + pmaddubsw m0, m1, m4 + paddw m1, m5 + paddw m0, m1 + pmaddubsw m1, m2, m4 + paddw m2, m5 + paddw m1, m2 + mova [rsp+gprsize+32*1], m0 + mova [rsp+gprsize+32*0], m1 + sub r3, hq + sub tlq, hq + sub r3, hq + ret + +%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2] + pmaddubsw m0, m%3, m%1 + pmaddubsw m1, m%4, m%2 +%ifnum %5 + paddw m0, m%5 +%else + paddw m0, %5 +%endif +%ifnum %6 + paddw m1, m%6 +%else + paddw m1, %6 +%endif + pavgw m0, m2 + pavgw m1, m3 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 +%endmacro + +cglobal ipred_smooth_8bpc, 3, 7, 0, dst, stride, tl, w, h, v_weights +%define base r6-ipred_smooth_avx2_table + lea r6, [ipred_smooth_avx2_table] + mov wd, wm + vpbroadcastb m4, [tlq+wq] ; right + tzcnt wd, wd + mov hd, hm + mov r5, tlq + sub r5, hq + movsxd wq, [r6+wq*4] + vpbroadcastd m5, [base+pb_127_m127] + vpbroadcastb m0, [r5] ; bottom + vpbroadcastd m3, [base+pw_255] + add wq, r6 + lea v_weightsq, [base+smooth_weights+hq*2] + jmp wq +.w4: + WIN64_SPILL_XMM 12 + mova m10, [base+ipred_h_shuf] + vpbroadcastq m11, [base+smooth_weights+4*2] + mova m7, [base+ipred_v_shuf] + vpbroadcastd m8, [tlq+1] + sub tlq, 8 + lea r3, [strideq*3] + sub tlq, hq + punpcklbw m8, m0 ; top, bottom + pshufd m6, m7, q2200 + pshufd m7, m7, q3311 + pmaddubsw m9, m8, m5 + paddw m3, m8 ; 1 * top + 255 * bottom + 255 + paddw m9, m3 ; 128 * top + 129 * bottom + 255 +.w4_loop: + vpbroadcastq m1, [tlq+hq] + pshufb m1, m10 + punpcklbw m0, m1, m4 ; left, right + punpckhbw m1, m4 + pmaddubsw m2, m0, m5 ; 127 * left - 127 * right + pmaddubsw m3, m1, m5 + paddw m2, m0 ; 128 * left + 129 * right + paddw m3, m1 + pmaddubsw m0, m11 + pmaddubsw m1, m11 + paddw m2, m0 + paddw m3, m1 + vbroadcasti128 m1, [v_weightsq] + add v_weightsq, 16 + pshufb m0, m1, m6 + pshufb m1, m7 + SMOOTH_2D_END 0, 1, 8, 8, 9, 9 + vextracti128 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xm1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+r3 ], xm1, 2 + cmp hd, 4 + je .ret + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 1 + pextrd [dstq+strideq*1], xm1, 1 + pextrd [dstq+strideq*2], xm0, 3 + pextrd [dstq+r3 ], xm1, 3 + lea dstq, [dstq+strideq*4] + sub hd, 8 + jg .w4_loop +.ret: + RET +ALIGN function_align +.w8: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 12 + mova m10, [base+ipred_h_shuf] + vbroadcasti128 m11, [base+smooth_weights+8*2] + mova m7, [base+ipred_v_shuf] + vpbroadcastq m8, [tlq+1] + sub tlq, 4 + lea r3, [strideq*3] + sub tlq, hq + punpcklbw m8, m0 + pshufd m6, m7, q0000 + pshufd m7, m7, q1111 + pmaddubsw m9, m8, m5 + paddw m3, m8 + paddw m9, m3 
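+ ; Worked form of the blend in the loops below: with vertical weight wv + ; and horizontal weight wh, each output pixel is + ; (wv*top + (256-wv)*bottom + wh*left + (256-wh)*right + 256) >> 9; + ; the pw_255 bias and pavgw in SMOOTH_2D_END supply the +256 and the + ; first >>1, psrlw 8 does the rest.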
+.w8_loop: + vpbroadcastd m1, [tlq+hq] + pshufb m1, m10 + punpcklbw m0, m1, m4 + punpckhbw m1, m4 + pmaddubsw m2, m0, m5 + pmaddubsw m3, m1, m5 + paddw m2, m0 + paddw m3, m1 + pmaddubsw m0, m11 + pmaddubsw m1, m11 + paddw m2, m0 + paddw m3, m1 + vpbroadcastq m1, [v_weightsq] + add v_weightsq, 8 + pshufb m0, m1, m6 + pshufb m1, m7 + SMOOTH_2D_END 0, 1, 8, 8, 9, 9 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+r3 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_loop + RET +ALIGN function_align +.w16: + SETUP_STACK_FRAME 32*4, 7, 14 + vbroadcasti128 m11, [tlq+1] + lea r3, [rsp+64*2-4] + punpcklbw m10, m11, m0 ; top, bottom + punpckhbw m11, m0 + call .prep_v + sub tlq, 2 + pmaddubsw m12, m10, m5 + pmaddubsw m13, m11, m5 + vpbroadcastd xm5, [base+pb_1] + mova m9, [base+ipred_v_shuf] + vbroadcasti128 m6, [base+smooth_weights+16*2] + vbroadcasti128 m7, [base+smooth_weights+16*3] + vperm2i128 m8, m9, m9, 0x01 + paddw m0, m10, m3 + paddw m3, m11 + paddw m12, m0 + paddw m13, m3 +.w16_loop: + vpbroadcastd m3, [tlq+hq] + vpbroadcastd m0, [r3+hq*2] + vpbroadcastd m1, [v_weightsq] + add v_weightsq, 4 + pshufb m3, m5 + punpcklbw m3, m4 ; left, right + pmaddubsw m2, m3, m6 + pmaddubsw m3, m7 + pshufb m0, m8 + pshufb m1, m9 + paddw m2, m0 + paddw m3, m0 + SMOOTH_2D_END 1, 1, 10, 11, 12, 13 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_loop + RET +ALIGN function_align +.w32: + SETUP_STACK_FRAME 32*4, 7, 11 + movu m8, [tlq+1] + lea r3, [rsp+64*2-2] + punpcklbw m7, m8, m0 + punpckhbw m8, m0 + call .prep_v + dec tlq + pmaddubsw m9, m7, m5 + pmaddubsw m10, m8, m5 + mova xm5, [base+smooth_weights+16*4] + vinserti128 m5, [base+smooth_weights+16*6], 1 + mova xm6, [base+smooth_weights+16*5] + vinserti128 m6, [base+smooth_weights+16*7], 1 + paddw m0, m7, m3 + paddw m3, m8 + paddw m9, m0 + paddw m10, m3 +.w32_loop: + vpbroadcastb m3, [tlq+hq] + punpcklbw m3, m4 + vpbroadcastw m0, [r3+hq*2] + vpbroadcastw m1, [v_weightsq] + add v_weightsq, 2 + pmaddubsw m2, m3, m5 + pmaddubsw m3, m6 + paddw m2, m0 + paddw m3, m0 + SMOOTH_2D_END 1, 1, 7, 8, 9, 10 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w32_loop + RET +ALIGN function_align +.w64: + SETUP_STACK_FRAME 32*8, 7, 16 + movu m13, [tlq+1 ] + movu m15, [tlq+33] + add r6, smooth_weights+16*15-ipred_smooth_avx2_table + lea r3, [rsp+64*2-2] + punpcklbw m12, m13, m0 + punpckhbw m13, m0 + punpcklbw m14, m15, m0 + punpckhbw m15, m0 + call .prep_v + dec tlq + pmaddubsw m0, m12, m5 + pmaddubsw m1, m13, m5 + pmaddubsw m2, m14, m5 + pmaddubsw m5, m15, m5 + mova xm8, [r6-16*7] + vinserti128 m8, [r6-16*5], 1 + mova xm9, [r6-16*6] + vinserti128 m9, [r6-16*4], 1 + mova xm10, [r6-16*3] + vinserti128 m10, [r6-16*1], 1 + mova xm11, [r6-16*2] + vinserti128 m11, [r6-16*0], 1 + lea r6, [rsp+32*4] + paddw m0, m3 + paddw m1, m3 + paddw m2, m3 + paddw m3, m5 + paddw m0, m12 + paddw m1, m13 + paddw m2, m14 + paddw m3, m15 + mova [r6+32*0], m0 + mova [r6+32*1], m1 + mova [r6+32*2], m2 + mova [r6+32*3], m3 +.w64_loop: + vpbroadcastb m5, [tlq+hq] + punpcklbw m5, m4 + vpbroadcastw m6, [r3+hq*2] + vpbroadcastw m7, [v_weightsq] + add v_weightsq, 2 + pmaddubsw m2, m5, m8 + pmaddubsw m3, m5, m9 + paddw m2, m6 + paddw m3, m6 + SMOOTH_2D_END 7, 7, 12, 13, [r6+32*0], [r6+32*1] + mova [dstq+32*0], m0 + pmaddubsw m2, m5, m10 + pmaddubsw m3, m5, m11 + paddw m2, m6 + paddw m3, m6 + SMOOTH_2D_END 7, 7, 14, 15, [r6+32*2], [r6+32*3] + mova 
[dstq+32*1], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET +ALIGN function_align +.prep_v: + vpermq m2, [tlq-32*1], q3120 + punpckhbw m1, m2, m4 + punpcklbw m2, m4 + pmaddubsw m0, m1, m5 ; 127 * left - 127 * right + paddw m0, m1 ; 128 * left + 129 * right + pmaddubsw m1, m2, m5 + paddw m1, m2 + vpermq m2, [tlq-32*2], q3120 + mova [rsp+gprsize+32*3], m0 + mova [rsp+gprsize+32*2], m1 + punpckhbw m1, m2, m4 + punpcklbw m2, m4 + pmaddubsw m0, m1, m5 + paddw m0, m1 + pmaddubsw m1, m2, m5 + paddw m1, m2 + mova [rsp+gprsize+32*1], m0 + mova [rsp+gprsize+32*0], m1 + sub r3, hq + sub tlq, hq + sub r3, hq + ret + +cglobal ipred_z1_8bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase + %assign org_stack_offset stack_offset + lea r6, [ipred_z1_avx2_table] + tzcnt wd, wm + movifnidn angled, anglem + movifnidn hd, hm + lea r7, [dr_intra_derivative] + inc tlq + movsxd wq, [r6+wq*4] + add wq, r6 + mov dxd, angled + and dxd, 0x7e + add angled, 165 ; ~90 + movzx dxd, word [r7+dxq] + xor angled, 0x4ff ; d = 90 - angle + vpbroadcastd m3, [pw_512] + vpbroadcastd m4, [pw_62] + vpbroadcastd m5, [pw_64] + jmp wq +.w4: + cmp angleb, 40 + jae .w4_no_upsample + lea r3d, [angleq-1024] + sar r3d, 7 + add r3d, hd + jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm) + ALLOC_STACK -32, 8 + mova xm1, [tlq-1] + pshufb xm0, xm1, [z_upsample1] + pshufb xm1, [z_upsample2] + vpbroadcastd xm2, [pb_36_m4] ; upshifted by 2 to be able to reuse + add dxd, dxd ; pw_512 (which is already in m3) + pmaddubsw xm0, xm2 ; for rounding instead of pw_2048 + pextrd [rsp+16], xm1, 3 ; top[max_base_x] + pmaddubsw xm1, xm2 + movd xm7, dxd + mov r3d, dxd ; xpos + vpbroadcastw m7, xm7 + paddw xm1, xm0 + movq xm0, [tlq] + pmulhrsw xm1, xm3 + pslldq m6, m7, 8 + paddw xm2, xm7, xm7 + lea r2, [strideq*3] + paddw m6, m7 + packuswb xm1, xm1 + paddw m6, m2 ; xpos2 xpos3 xpos0 xpos1 + punpcklbw xm0, xm1 + psllw m7, 2 + mova [rsp], xm0 +.w4_upsample_loop: + lea r5d, [r3+dxq] + shr r3d, 6 ; base0 + vpbroadcastq m1, [rsp+r3] + lea r3d, [r5+dxq] + shr r5d, 6 ; base1 + vpbroadcastq m2, [rsp+r5] + lea r5d, [r3+dxq] + shr r3d, 6 ; base2 + movq xm0, [rsp+r3] + lea r3d, [r5+dxq] + shr r5d, 6 ; base3 + movhps xm0, [rsp+r5] + vpblendd m1, m2, 0xc0 + pand m2, m4, m6 ; frac + vpblendd m0, m1, 0xf0 + psubw m1, m5, m2 ; 64-frac + psllw m2, 8 + por m1, m2 ; 64-frac, frac + pmaddubsw m0, m1 + paddw m6, m7 ; xpos += dx + pmulhrsw m0, m3 + packuswb m0, m0 + vextracti128 xm1, m0, 1 + movd [dstq+strideq*2], xm0 + pextrd [dstq+r2 ], xm0, 1 + movd [dstq+strideq*0], xm1 + pextrd [dstq+strideq*1], xm1, 1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_upsample_loop + RET +ALIGN function_align +.filter_strength: ; w4/w8/w16 + ; The C version uses a lot of branches, but we can do all the comparisons + ; in parallel and use popcnt to get the final filter strength value. 
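+ ; In scalar terms the mask built here amounts to + ; strength = sum over i of (wh matches z_filter_wh[i] && angle > t[i]), + ; where t[] comes from z_filter_t0/t1 depending on is_sm, so the + ; caller's popcnt of the pmovmskb result is the final strength.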
+%define base r3-z_filter_t0 + lea r3, [z_filter_t0] + movd xm0, maxbased + movd xm2, angled + shr angled, 8 ; is_sm << 1 + vpbroadcastb m0, xm0 + vpbroadcastb m2, xm2 + pcmpeqb m1, m0, [base+z_filter_wh] + pand m1, m2 + mova xm2, [r3+angleq*8] ; upper ymm half zero in both cases + pcmpgtb m1, m2 + pmovmskb r5d, m1 + ret +.w4_no_upsample: + %assign stack_offset org_stack_offset + ALLOC_STACK -16, 11 + mov maxbased, 7 + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w4_main + lea maxbased, [hq+3] + call .filter_strength + mov maxbased, 7 + test r5d, r5d + jz .w4_main ; filter_strength == 0 + popcnt r5d, r5d + vpbroadcastd m7, [base+pb_8] + vbroadcasti128 m2, [tlq-1] + pminub m1, m7, [base+z_filter_s] + vpbroadcastd m8, [base+z_filter_k-4+r5*4+12*0] + pminub m7, [base+z_filter_s+8] + vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd m10, [base+z_filter_k-4+r5*4+12*2] + pshufb m0, m2, m1 + shufps m1, m7, q2121 + pmaddubsw m0, m8 + pshufb m1, m2, m1 + pmaddubsw m1, m9 + pshufb m2, m7 + pmaddubsw m2, m10 + paddw m0, m1 + paddw m0, m2 + pmulhrsw m0, m3 + mov r3d, 9 + mov tlq, rsp + cmp hd, 4 + cmovne maxbased, r3d + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + mova [tlq], xm0 +.w4_main: + movd xm6, dxd + vpbroadcastq m0, [z_base_inc] ; base_inc << 6 + vpbroadcastb m7, [tlq+maxbaseq] + shl maxbased, 6 + vpbroadcastw m6, xm6 + mov r3d, dxd ; xpos + movd xm9, maxbased + vpbroadcastw m9, xm9 + vbroadcasti128 m8, [z1_shuf_w4] + psrlw m7, 8 ; top[max_base_x] + paddw m10, m6, m6 + psubw m9, m0 ; max_base_x + vpblendd m6, m10, 0xcc + mova xm0, xm10 + paddw m6, m0 ; xpos2 xpos3 xpos0 xpos1 + paddw m10, m10 +.w4_loop: + lea r5d, [r3+dxq] + shr r3d, 6 ; base0 + vpbroadcastq m1, [tlq+r3] + lea r3d, [r5+dxq] + shr r5d, 6 ; base1 + vpbroadcastq m2, [tlq+r5] + lea r5d, [r3+dxq] + shr r3d, 6 ; base2 + movq xm0, [tlq+r3] + lea r3d, [r5+dxq] + shr r5d, 6 ; base3 + movhps xm0, [tlq+r5] + vpblendd m1, m2, 0xc0 + pand m2, m4, m6 ; frac + vpblendd m0, m1, 0xf0 + psubw m1, m5, m2 ; 64-frac + psllw m2, 8 + pshufb m0, m8 + por m1, m2 ; 64-frac, frac + pmaddubsw m0, m1 + pcmpgtw m1, m9, m6 ; base < max_base_x + pmulhrsw m0, m3 + paddw m6, m10 ; xpos += dx + lea r5, [dstq+strideq*2] + vpblendvb m0, m7, m0, m1 + packuswb m0, m0 + vextracti128 xm1, m0, 1 + movd [r5 +strideq*0], xm0 + pextrd [r5 +strideq*1], xm0, 1 + movd [dstq+strideq*0], xm1 + pextrd [dstq+strideq*1], xm1, 1 + sub hd, 4 + jz .w4_end + lea dstq, [dstq+strideq*4] + cmp r3d, maxbased + jb .w4_loop + packuswb xm7, xm7 + lea r6, [strideq*3] +.w4_end_loop: + movd [dstq+strideq*0], xm7 + movd [dstq+strideq*1], xm7 + movd [dstq+strideq*2], xm7 + movd [dstq+r6 ], xm7 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_end_loop +.w4_end: + RET +ALIGN function_align +.w8: + lea r3d, [angleq+216] + mov r3b, hb + cmp r3d, 8 + ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 + %assign stack_offset org_stack_offset + ALLOC_STACK -32, 8 + movu xm2, [z_filter_s+6] + mova xm0, [tlq-1] + movd xm6, hd + vinserti128 m0, [tlq+7], 1 + vpbroadcastb xm6, xm6 + vbroadcasti128 m1, [z_upsample1] + pminub xm6, xm2 + vpbroadcastd m7, [pb_36_m4] + vinserti128 m2, xm6, 1 + add dxd, dxd + pshufb m1, m0, m1 + pshufb m2, m0, m2 + movd xm6, dxd + pmaddubsw m1, m7 + pmaddubsw m2, m7 + vpbroadcastw m6, xm6 + mov r3d, dxd + psrldq m0, 1 + lea r2, [strideq*3] + paddw m7, m6, m6 + paddw m1, m2 + vpblendd m6, m7, 0xf0 + pmulhrsw m1, m3 + pslldq m2, m7, 8 + paddw m7, m7 + paddw m6, m2 + packuswb m1, m1 + punpcklbw m0, m1 + mova [rsp], m0 
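+ ; The interleaved edge written to [rsp] implements the spec's upsample + ; kernel out[2*i+1] = clip((-in[i-1] + 9*in[i] + 9*in[i+1] - in[i+2] + 8) >> 4); + ; the 36/-4 taps are that kernel scaled by 4 so pmulhrsw with pw_512 + ; performs the (+8) >> 4 rounding.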
+.w8_upsample_loop: + lea r5d, [r3+dxq] + shr r3d, 6 ; base0 + movu xm0, [rsp+r3] + lea r3d, [r5+dxq] + shr r5d, 6 ; base1 + vinserti128 m0, [rsp+r5], 1 + lea r5d, [r3+dxq] + shr r3d, 6 ; base2 + pand m1, m4, m6 + psubw m2, m5, m1 + psllw m1, 8 + por m2, m1 + punpcklqdq m1, m2, m2 ; frac0 frac1 + pmaddubsw m0, m1 + movu xm1, [rsp+r3] + lea r3d, [r5+dxq] + shr r5d, 6 ; base3 + vinserti128 m1, [rsp+r5], 1 + punpckhqdq m2, m2 ; frac2 frac3 + pmaddubsw m1, m2 + pmulhrsw m0, m3 + paddw m6, m7 + pmulhrsw m1, m3 + packuswb m0, m1 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*2], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+r2 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_upsample_loop + RET +.w8_no_intra_edge_filter: + and maxbased, 7 + or maxbased, 8 ; imin(h+7, 15) + jmp .w8_main +.w8_no_upsample: + %assign stack_offset org_stack_offset + ALLOC_STACK -32, 10 + lea maxbased, [hq+7] + test angled, 0x400 + jnz .w8_no_intra_edge_filter + call .filter_strength + test r5d, r5d + jz .w8_main ; filter_strength == 0 + popcnt r5d, r5d + movu xm2, [tlq] + pminub xm1, xm0, [base+z_filter_s+14] + vinserti128 m2, [tlq-1], 1 + vinserti128 m1, [base+z_filter_s+ 0], 1 + vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0] + pminub xm0, [base+z_filter_s+22] + vinserti128 m0, [base+z_filter_s+ 8], 1 + pshufb m6, m2, m1 + pmaddubsw m6, m7 + vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*1] + movzx r3d, byte [tlq+15] + shufps m1, m0, q2121 + pshufb m1, m2, m1 + pmaddubsw m1, m7 + paddw m1, m6 + sub r5d, 3 + jnz .w8_3tap + ; filter_strength == 3 uses a 5-tap filter instead of a 3-tap one, + ; which also results in an awkward edge case where out[w*2] is + ; slightly different from out[max_base_x] when h > w. + vpbroadcastd m7, [z_filter_k+4*8] + movzx r2d, byte [tlq+14] + pshufb m2, m0 + pmaddubsw m2, m7 + sub r2d, r3d + lea r2d, [r2+r3*8+4] + shr r2d, 3 ; (tlq[w*2-2] + tlq[w*2-1]*7 + 4) >> 3 + mov [rsp+16], r2b + paddw m1, m2 +.w8_3tap: + pmulhrsw m1, m3 + sar r5d, 1 + mov tlq, rsp + add r5d, 17 ; w*2 + (filter_strength == 3) + cmp hd, 16 + cmovns maxbased, r5d + mov [tlq+r5], r3b + vextracti128 xm0, m1, 1 + packuswb xm0, xm1 + mova [tlq], xm0 +.w8_main: + movd xm2, dxd + vbroadcasti128 m0, [z_base_inc] + vpbroadcastw m2, xm2 + vpbroadcastb m7, [tlq+maxbaseq] + shl maxbased, 6 + movd xm9, maxbased + vbroadcasti128 m8, [z_filter_s+2] + vpbroadcastw m9, xm9 + psrlw m7, 8 + psubw m9, m0 + mov r3d, dxd + paddw m6, m2, m2 + vpblendd m2, m6, 0xf0 +.w8_loop: + lea r5d, [r3+dxq] + shr r3d, 6 + pand m0, m4, m2 + psubw m1, m5, m0 + psllw m0, 8 + por m1, m0 + movu xm0, [tlq+r3] + lea r3d, [r5+dxq] + shr r5d, 6 ; base1 + vinserti128 m0, [tlq+r5], 1 + pshufb m0, m8 + pmaddubsw m0, m1 + pcmpgtw m1, m9, m2 + paddw m2, m6 + pmulhrsw m0, m3 + vpblendvb m0, m7, m0, m1 + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + sub hd, 2 + jz .w8_end + lea dstq, [dstq+strideq*2] + cmp r3d, maxbased + jb .w8_loop + packuswb xm7, xm7 +.w8_end_loop: + movq [dstq+strideq*0], xm7 + movq [dstq+strideq*1], xm7 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_end_loop +.w8_end: + RET +.w16_no_intra_edge_filter: + and maxbased, 15 + or maxbased, 16 ; imin(h+15, 31) + jmp .w16_main +ALIGN function_align +.w16: + %assign stack_offset org_stack_offset + ALLOC_STACK -64, 12 + lea maxbased, [hq+15] + test angled, 0x400 + jnz .w16_no_intra_edge_filter + call .filter_strength + test r5d, r5d + jz .w16_main ; filter_strength == 0 + popcnt r5d, r5d + vpbroadcastd m1, 
[base+pb_12] + vbroadcasti128 m6, [base+z_filter_s+8] + vinserti128 m2, m6, [base+z_filter_s], 0 + vinserti128 m6, [base+z_filter_s+16], 1 + mova xm10, [tlq-1] + vinserti128 m10, [tlq+3], 1 + vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*0] + vbroadcasti128 m7, [base+z_filter_s+14] + vinserti128 m8, m7, [base+z_filter_s+6], 0 + vinserti128 m7, [base+z_filter_s+22], 1 + psubw m0, m1 + movu xm11, [tlq+12] + vinserti128 m11, [tlq+16], 1 + pminub m8, m0 + pminub m7, m0 + pshufb m0, m10, m2 + shufps m2, m6, q2121 + pmaddubsw m0, m9 + pshufb m1, m11, m8 + shufps m8, m7, q2121 + pmaddubsw m1, m9 + vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] + movzx r3d, byte [tlq+31] + pshufb m2, m10, m2 + pmaddubsw m2, m9 + pshufb m8, m11, m8 + pmaddubsw m8, m9 + paddw m0, m2 + paddw m1, m8 + sub r5d, 3 + jnz .w16_3tap + vpbroadcastd m9, [z_filter_k+4*8] + movzx r2d, byte [tlq+30] + pshufb m10, m6 + pmaddubsw m10, m9 + pshufb m11, m7 + pmaddubsw m11, m9 + sub r2d, r3d + lea r2d, [r2+r3*8+4] + shr r2d, 3 + mov [rsp+32], r2b + paddw m0, m10 + paddw m1, m11 +.w16_3tap: + pmulhrsw m0, m3 + pmulhrsw m1, m3 + sar r5d, 1 + mov tlq, rsp + add r5d, 33 + cmp hd, 32 + cmovns maxbased, r5d + mov [tlq+r5], r3b + packuswb m0, m1 + vpermq m0, m0, q3120 + mova [tlq], m0 +.w16_main: + movd xm6, dxd + vbroadcasti128 m0, [z_base_inc] + vpbroadcastb m7, [tlq+maxbaseq] + shl maxbased, 6 + vpbroadcastw m6, xm6 + movd xm9, maxbased + vbroadcasti128 m8, [z_filter_s+2] + vpbroadcastw m9, xm9 + mov r3d, dxd + psubw m9, m0 + paddw m11, m6, m6 + psubw m10, m9, m3 ; 64*8 + vpblendd m6, m11, 0xf0 +.w16_loop: + lea r5d, [r3+dxq] + shr r3d, 6 ; base0 + pand m1, m4, m6 + psubw m2, m5, m1 + psllw m1, 8 + por m2, m1 + movu xm0, [tlq+r3+0] + movu xm1, [tlq+r3+8] + lea r3d, [r5+dxq] + shr r5d, 6 ; base1 + vinserti128 m0, [tlq+r5+0], 1 + vinserti128 m1, [tlq+r5+8], 1 + pshufb m0, m8 + pshufb m1, m8 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + pcmpgtw m1, m9, m6 + pcmpgtw m2, m10, m6 + packsswb m1, m2 + paddw m6, m11 + vpblendvb m0, m7, m0, m1 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + sub hd, 2 + jz .w16_end + lea dstq, [dstq+strideq*2] + cmp r3d, maxbased + jb .w16_loop +.w16_end_loop: + mova [dstq+strideq*0], xm7 + mova [dstq+strideq*1], xm7 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_end_loop +.w16_end: + RET +ALIGN function_align +.w32: + %assign stack_offset org_stack_offset + ALLOC_STACK -96, 15 + lea r3d, [hq+31] + mov maxbased, 63 + cmp hd, 32 + cmovs maxbased, r3d + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w32_main + vbroadcasti128 m0, [pb_0to15] + sub r3d, 29 ; h+2 + movu xm13, [tlq+29] ; 32-39 + movd xm1, r3d + movu xm14, [tlq+37] ; 40-47 + sub r3d, 8 ; h-6 + vinserti128 m14, [tlq+51], 1 ; 56-63 + vpbroadcastb xm1, xm1 + mova xm11, [tlq- 1] ; 0- 7 + vinserti128 m11, [tlq+13], 1 ; 16-23 + movd xm2, r3d + movu xm12, [tlq+ 5] ; 8-15 + vinserti128 m12, [tlq+19], 1 ; 24-31 + pminub xm1, xm0 ; clip 32x8 + mova m7, [z_filter_s+0] + pshufb xm13, xm1 + vpbroadcastd m1, [pb_12] + vpbroadcastb xm2, xm2 + vinserti128 m13, [tlq+43], 1 ; 48-55 + vinserti128 m8, m7, [z_filter_s+4], 1 + vpblendd m2, m1, 0xf0 + vinserti128 m7, [z_filter_s+12], 0 + pminub m2, m0 ; clip 32x16 and 32x(32|64) + vpbroadcastd m9, [z_filter_k+4*2+12*0] + pshufb m14, m2 + pshufb m0, m11, m8 + shufps m8, m7, q1021 + pmaddubsw m0, m9 + pshufb m2, m12, m8 + pmaddubsw m2, m9 + pshufb m1, m13, m8 + pmaddubsw m1, m9 + pshufb m6, m14, m8 + pmaddubsw m6, m9 + vpbroadcastd m9, 
[z_filter_k+4*2+12*1] + pshufb m10, m11, m8 + shufps m8, m7, q2121 + pmaddubsw m10, m9 + paddw m0, m10 + pshufb m10, m12, m8 + pmaddubsw m10, m9 + paddw m2, m10 + pshufb m10, m13, m8 + pmaddubsw m10, m9 + paddw m1, m10 + pshufb m10, m14, m8 + pmaddubsw m10, m9 + paddw m6, m10 + vpbroadcastd m9, [z_filter_k+4*2+12*2] + pshufb m11, m8 + pmaddubsw m11, m9 + pshufb m12, m7 + pmaddubsw m12, m9 + movzx r3d, byte [tlq+63] + movzx r2d, byte [tlq+62] + paddw m0, m11 + paddw m2, m12 + pshufb m13, m7 + pmaddubsw m13, m9 + pshufb m14, m7 + pmaddubsw m14, m9 + paddw m1, m13 + paddw m6, m14 + sub r2d, r3d + lea r2d, [r2+r3*8+4] ; edge case for 32x64 + pmulhrsw m0, m3 + pmulhrsw m2, m3 + pmulhrsw m1, m3 + pmulhrsw m6, m3 + shr r2d, 3 + mov [rsp+64], r2b + mov tlq, rsp + mov [tlq+65], r3b + mov r3d, 65 + cmp hd, 64 + cmove maxbased, r3d + packuswb m0, m2 + packuswb m1, m6 + mova [tlq+ 0], m0 + mova [tlq+32], m1 +.w32_main: + movd xm6, dxd + vpbroadcastb m7, [tlq+maxbaseq] + shl maxbased, 6 + vpbroadcastw m6, xm6 + movd xm9, maxbased + vbroadcasti128 m8, [z_filter_s+2] + vpbroadcastw m9, xm9 + mov r5d, dxd + psubw m9, [z_base_inc] + mova m11, m6 + psubw m10, m9, m3 ; 64*8 +.w32_loop: + mov r3d, r5d + shr r3d, 6 + pand m1, m4, m6 + psubw m2, m5, m1 + psllw m1, 8 + por m2, m1 + movu m0, [tlq+r3+0] + movu m1, [tlq+r3+8] + add r5d, dxd + pshufb m0, m8 + pshufb m1, m8 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + pcmpgtw m1, m9, m6 + pcmpgtw m2, m10, m6 + packsswb m1, m2 + paddw m6, m11 + vpblendvb m0, m7, m0, m1 + mova [dstq], m0 + dec hd + jz .w32_end + add dstq, strideq + cmp r5d, maxbased + jb .w32_loop + test hb, 1 + jz .w32_end_loop + mova [dstq], m7 + add dstq, strideq + dec hd + jz .w32_end +.w32_end_loop: + mova [dstq+strideq*0], m7 + mova [dstq+strideq*1], m7 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_end_loop +.w32_end: + RET +ALIGN function_align +.w64: + %assign stack_offset org_stack_offset + ALLOC_STACK -128, 16 + lea maxbased, [hq+63] + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w64_main + mova xm11, [tlq- 1] ; 0- 7 + vinserti128 m11, [tlq+13], 1 ; 16-23 + movu xm12, [tlq+ 5] ; 8-15 + vinserti128 m12, [tlq+19], 1 ; 24-31 + mova m7, [z_filter_s+0] + vinserti128 m8, m7, [z_filter_s+4], 1 + vinserti128 m7, [z_filter_s+12], 0 + vpbroadcastd m9, [z_filter_k+4*2+12*0] + movu xm13, [tlq+29] ; 32-39 + vinserti128 m13, [tlq+43], 1 ; 48-55 + movu xm14, [tlq+37] ; 40-47 + vinserti128 m14, [tlq+51], 1 ; 56-63 + pshufb m0, m11, m8 + shufps m8, m7, q1021 + pmaddubsw m0, m9 + pshufb m2, m12, m8 + pmaddubsw m2, m9 + pshufb m1, m13, m8 + pmaddubsw m1, m9 + pshufb m6, m14, m8 + pmaddubsw m6, m9 + vpbroadcastd m9, [z_filter_k+4*2+12*1] + pshufb m10, m11, m8 + shufps m15, m8, m7, q2121 + pmaddubsw m10, m9 + paddw m0, m10 + pshufb m10, m12, m15 + pmaddubsw m10, m9 + paddw m2, m10 + pshufb m10, m13, m15 + pmaddubsw m10, m9 + paddw m1, m10 + pshufb m10, m14, m15 + pmaddubsw m10, m9 + paddw m6, m10 + vpbroadcastd m10, [z_filter_k+4*2+12*2] + pshufb m11, m15 + pmaddubsw m11, m10 + pshufb m12, m7 + pmaddubsw m12, m10 + pshufb m13, m7 + pmaddubsw m13, m10 + pshufb m14, m7 + pmaddubsw m14, m10 + paddw m0, m11 + paddw m2, m12 + paddw m1, m13 + paddw m6, m14 + movu xm11, [tlq+ 61] ; 64- 71 + vinserti128 m11, [tlq+ 75], 1 ; 80- 87 + movu xm12, [tlq+ 69] ; 72- 79 + vinserti128 m12, [tlq+ 83], 1 ; 88- 95 + movu xm13, [tlq+ 93] ; 96-103 + vinserti128 m13, [tlq+107], 1 ; 112-119 + movu xm14, [tlq+101] ; 104-111 + vinserti128 m14, [tlq+115], 1 ; 120-127 + pmulhrsw 
m0, m3 + pmulhrsw m2, m3 + pmulhrsw m1, m3 + pmulhrsw m6, m3 + lea r3d, [hq-20] + mov tlq, rsp + packuswb m0, m2 + packuswb m1, m6 + vpbroadcastd xm2, [pb_14] + vbroadcasti128 m6, [pb_0to15] + mova [tlq+32*0], m0 + mova [tlq+32*1], m1 + movd xm0, r3d + vpbroadcastd m1, [pb_12] + vpbroadcastb m0, xm0 + paddb m0, m2 + pminub m0, m6 ; clip 64x16 and 64x32 + pshufb m12, m0 + pminub m1, m6 ; clip 64x64 + pshufb m14, m1 + pshufb m0, m11, m7 + pmaddubsw m0, m10 + pshufb m2, m12, m7 + pmaddubsw m2, m10 + pshufb m1, m13, m7 + pmaddubsw m1, m10 + pshufb m6, m14, m7 + pmaddubsw m6, m10 + pshufb m7, m11, m15 + pmaddubsw m7, m9 + pshufb m10, m12, m15 + pmaddubsw m10, m9 + paddw m0, m7 + pshufb m7, m13, m15 + pmaddubsw m7, m9 + paddw m2, m10 + pshufb m10, m14, m15 + pmaddubsw m10, m9 + paddw m1, m7 + paddw m6, m10 + vpbroadcastd m9, [z_filter_k+4*2+12*0] + pshufb m11, m8 + pmaddubsw m11, m9 + pshufb m12, m8 + pmaddubsw m12, m9 + pshufb m13, m8 + pmaddubsw m13, m9 + pshufb m14, m8 + pmaddubsw m14, m9 + paddw m0, m11 + paddw m2, m12 + paddw m1, m13 + paddw m6, m14 + pmulhrsw m0, m3 + pmulhrsw m2, m3 + pmulhrsw m1, m3 + pmulhrsw m6, m3 + packuswb m0, m2 + packuswb m1, m6 + mova [tlq+32*2], m0 + mova [tlq+32*3], m1 +.w64_main: + movd xm12, dxd + vpbroadcastb m7, [tlq+maxbaseq] + lea r3d, [dxq-64] + shl maxbased, 6 + vpbroadcastw m12, xm12 + sub r3d, maxbased + vbroadcasti128 m8, [z_filter_s+2] + movd xm6, r3d + mov r5d, dxd + mova m10, [pb_1to32] + vpbroadcastd m11, [pb_32] + vpbroadcastw m6, xm6 +.w64_loop: + mov r3d, r5d + shr r3d, 6 + movu m0, [tlq+r3+ 0] + movu m1, [tlq+r3+ 8] + pand m2, m4, m6 + psubw m9, m5, m2 + psllw m2, 8 + por m9, m2 + pshufb m0, m8 + pshufb m1, m8 + pmaddubsw m0, m9 + pmaddubsw m1, m9 + psraw m2, m6, 6 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packsswb m2, m2 + paddb m2, m10 + packuswb m0, m1 + vpblendvb m0, m7, m0, m2 + mova [dstq+ 0], m0 + movu m0, [tlq+r3+32] + movu m1, [tlq+r3+40] + add r5d, dxd + pshufb m0, m8 + pshufb m1, m8 + pmaddubsw m0, m9 + pmaddubsw m1, m9 + paddb m2, m11 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + paddw m6, m12 + packuswb m0, m1 + vpblendvb m0, m7, m0, m2 + mova [dstq+32], m0 + dec hd + jz .w64_end + add dstq, strideq + cmp r5d, maxbased + jb .w64_loop +.w64_end_loop: + mova [dstq+ 0], m7 + mova [dstq+32], m7 + add dstq, strideq + dec hd + jg .w64_end_loop +.w64_end: + RET + +cglobal ipred_z2_8bpc, 3, 10, 16, 224, dst, stride, tl, w, h, angle, dx, dy +%define base r9-z_filter_t0 + lea r9, [ipred_z2_avx2_table] + tzcnt wd, wm + movifnidn angled, anglem + movifnidn hd, hm + lea dxq, [dr_intra_derivative-90] + movsxd wq, [r9+wq*4] + movzx dyd, angleb + xor angled, 0x400 + mov r8, dxq + sub dxq, dyq + add wq, r9 + add r9, z_filter_t0-ipred_z2_avx2_table + mova m2, [tlq-64] + mova m0, [tlq-32] + mova m1, [tlq] + and dyd, ~1 + and dxq, ~1 + movzx dyd, word [r8+dyq] ; angle - 90 + movzx dxd, word [dxq+270] ; 180 - angle + vpbroadcastd m13, [base+pw_512] + vpbroadcastd m14, [base+pw_62] + vpbroadcastd m15, [base+pw_64] + mova [rsp+ 0], m2 + mova [rsp+32], m0 + mova [rsp+64], m1 + neg dxd + neg dyd + jmp wq +.w4: + vpbroadcastq m6, [base+z2_base_inc] ; base_inc << 6 + vbroadcasti128 m10, [base+z1_shuf_w4] + vbroadcasti128 m11, [base+z2_shuf_h4] + lea r2d, [dxq+(65<<6)] ; xpos + movd xm5, dyd + mov r8d, (63-4)<<6 + mov dyq, -4 + pshuflw xm5, xm5, q0000 + pmullw xm5, [base+z2_ymul] + test angled, 0x400 + jnz .w4_main ; !enable_intra_edge_filter + lea r3d, [hq+2] + add angled, 1022 + shl r3d, 6 + test r3d, angled + jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || 
(is_sm && h == 8) + vpbroadcastd xm3, [base+pb_4] + call .upsample_above + sub angled, 1075 ; angle - 53 + lea r3d, [hq+3] + xor angled, 0x7f ; 180 - angle + call .filter_strength + jmp .w4_filter_left +ALIGN function_align +.filter_strength: + movd xm8, r3d + mov r3d, angled + movd xm7, angled + vpbroadcastb m8, xm8 + shr r3d, 8 ; is_sm << 1 + vpbroadcastb m7, xm7 + pcmpeqb m8, [base+z_filter_wh] + mova xm9, [r9+r3*8] + pand m0, m8, m7 + pcmpgtb m0, m9 + pmovmskb r3d, m0 + ret +ALIGN function_align +.upsample_above: ; w4/w8 + pshufb xm2, xm1, [base+z_upsample1-2] + pminub xm3, [base+z_filter_s+4] + vpbroadcastd xm4, [base+pb_36_m4] + vbroadcasti128 m10, [base+pb_0to15] + pshufb xm3, xm1, xm3 + pmaddubsw xm2, xm4 + pmaddubsw xm3, xm4 + lea r2d, [r2+dxq+(1<<6)] + add dxd, dxd + paddw xm2, xm3 + pmulhrsw xm2, xm13 + sub r8d, 3<<6 + paddw m6, m6 + packuswb xm2, xm2 + punpcklbw xm1, xm2 + mova [rsp+gprsize+64], xm1 + ret +ALIGN function_align +.upsample_left: ; h4/h8 + mov r3d, hd + and r3d, 4 + movd xm2, [rsp+gprsize+64] + movddup xm0, [rsp+gprsize+56] + movd xm1, r3d + palignr xm2, xm0, 1 + vpbroadcastb xm1, xm1 + pshufb xm2, [base+z_filter_s+18] + vpbroadcastd xm3, [base+pb_36_m4] + pmaxub xm1, [base+z_upsample1-2] + pshufb xm1, xm0, xm1 + pmaddubsw xm2, xm3 + pmaddubsw xm1, xm3 + paddw xm5, xm5 + add dyq, dyq + paddw xm1, xm2 + pmulhrsw xm1, xm13 + vbroadcasti128 m11, [base+z2_upsample] + paddw xm5, xm15 + packuswb xm1, xm1 + punpcklbw xm0, xm1 + mova [rsp+gprsize+48], xm0 + ret +.w4_no_upsample_above: + lea r3d, [hq+3] + sub angled, 1112 ; angle - 90 + call .filter_strength + test r3d, r3d + jz .w4_no_filter_above + popcnt r3d, r3d + vpbroadcastd xm2, [base+pb_4] + pminub xm2, [base+z_filter_s] + vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0] + vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] + pshufb xm3, xm1, xm2 ; 00 01 12 23 + pshufd xm2, xm2, q0321 + pmaddubsw xm0, xm3, xm0 + pshufb xm2, xm1, xm2 ; 12 23 34 44 + pmaddubsw xm2, xm4 + vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2] + punpckhqdq xm3, xm3 ; 34 44 44 44 + pmaddubsw xm3, xm4 + movd xm4, r6m ; max_width + pminsw xm4, xm15 + vpbroadcastb xm4, xm4 + paddw xm0, xm2 + paddw xm0, xm3 + pmulhrsw xm0, xm13 + psubb xm4, [base+pb_1to32] + psrlq xm1, 8 + packuswb xm0, xm0 + vpblendvb xm0, xm1, xm4 + movd [rsp+65], xm0 +.w4_no_filter_above: + lea r3d, [hq+2] + add angled, 973 ; angle + 883 + shl r3d, 6 + test r3d, angled + jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8) + vpbroadcastd xm0, [base+pb_90] + psubb xm0, xm7 ; 180 - angle + pand xm0, xm8 ; reuse from previous filter_strength call + pcmpgtb xm0, xm9 + pmovmskb r3d, xm0 +.w4_filter_left: + test r3d, r3d + jz .w4_main + popcnt r3d, r3d + mov r5d, 10 + cmp hd, 16 + movu xm2, [rsp+49] + vinserti128 m2, [rsp+43], 1 + cmovs r5d, hd + xor r5d, 15 ; h == 16 ? 
5 : 15 - h + movd xm0, r5d + vbroadcasti128 m1, [base+z_filter_s+12] + vbroadcasti128 m4, [base+z_filter_s+16] + vinserti128 m3, m1, [z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab + vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd + vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef + vpbroadcastb m0, xm0 + pmaxub m0, m3 + vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] + pshufb m0, m2, m0 + pmaddubsw m0, m3 + vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*1] + pshufb m1, m2, m1 + pmaddubsw m1, m3 + vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*2] + pshufb m2, m4 + pmaddubsw m2, m3 + movd xm4, r7m ; max_height + pminsw xm4, xm15 + vpbroadcastb xm4, xm4 + psubb xm4, [base+pb_16to1] + paddw m1, m0 + paddw m1, m2 + pmulhrsw m1, m13 + vextracti128 xm0, m1, 1 + packuswb xm0, xm1 + vpblendvb xm0, [rsp+48], xm4 + mova [rsp+48], xm0 + jmp .w4_main +.w4_upsample_left: + call .upsample_left +.w4_main: + movd xm0, dxd + mova m12, [base+z2_y_shuf_h4] + lea r5, [rsp+56] ; left-7 + vpbroadcastw m0, xm0 + lea r9, [strideq*3] + psraw xm1, xm5, 6 + pand xm5, xm14 ; frac_y + pxor xm2, xm2 + paddw m7, m0, m0 + psubw xm4, xm2, xm1 ; base_y + vpblendd m0, m7, 0xcc + mova xm1, xm7 + punpcklwd xm4, xm2 + paddw m0, m1 ; xpos2 xpos3 xpos0 xpos1 + psubw xm1, xm15, xm5 ; 64-frac_y + psllw xm5, 8 + paddw m7, m7 + paddw m6, m0 + por xm5, xm1 ; 64-frac_y, frac_y + vpbroadcastq m5, xm5 +.w4_loop: + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x0 + vpbroadcastq m1, [rsp+r2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x1 + vpbroadcastq m2, [rsp+r3] + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x2 + movq xm0, [rsp+r2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x3 + movhps xm0, [rsp+r3] + vpblendd m1, m2, 0xc0 + pand m2, m14, m6 ; frac_x + vpblendd m0, m1, 0xf0 + psubw m1, m15, m2 ; 64-frac_x + psllw m2, 8 + pshufb m0, m10 + por m1, m2 ; 64-frac_x, frac_x + pmaddubsw m0, m1 + cmp r3d, 64 + jge .w4_toponly + mova m1, m7 ; arbitrary negative value + vpgatherdq m3, [r5+xm4], m1 + pshufb m1, m3, m11 + vpermd m1, m12, m1 + pmaddubsw m1, m5 + psraw m2, m6, 15 ; base_x < topleft + vpblendvb m0, m1, m2 +.w4_toponly: + pmulhrsw m0, m13 + paddw m6, m7 ; xpos += dx + add r5, dyq + packuswb m0, m0 + vextracti128 xm1, m0, 1 + movd [dstq+strideq*2], xm0 + pextrd [dstq+r9 ], xm0, 1 + movd [dstq+strideq*0], xm1 + pextrd [dstq+strideq*1], xm1, 1 + sub hd, 4 + jz .w4_end + lea dstq, [dstq+strideq*4] + cmp r2d, r8d + jge .w4_loop +.w4_leftonly_loop: + mova m1, m7 + vpgatherdq m2, [r5+xm4], m1 + add r5, dyq + pshufb m0, m2, m11 + vpermd m0, m12, m0 + pmaddubsw m0, m5 + pmulhrsw m0, m13 + packuswb m0, m0 + vextracti128 xm1, m0, 1 + movd [dstq+strideq*2], xm0 + pextrd [dstq+r9 ], xm0, 1 + movd [dstq+strideq*0], xm1 + pextrd [dstq+strideq*1], xm1, 1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_leftonly_loop +.w4_end: + RET +.w8: + vbroadcasti128 m6, [base+z2_base_inc] ; base_inc << 6 + movd xm5, dyd + vbroadcasti128 m10, [base+z_filter_s+2] + vbroadcasti128 m11, [base+z2_shuf_h4] + lea r2d, [dxq+(65<<6)] ; xpos + vpbroadcastw xm5, xm5 + mov r8d, (63-8)<<6 + mov dyq, -4 + pmullw xm5, [base+z2_ymul] + test angled, 0x400 + jnz .w8_main + lea r3d, [angleq+126] + mov r3b, hb + cmp r3d, 8 + ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm + vpbroadcastd xm3, [base+pb_8] + movhps [rsp+80], xm1 + call .upsample_above + sub angled, 53 ; angle - 53 + lea r3d, [hq+7] + xor angled, 0x7f ; 180 - angle + call .filter_strength + jmp .w8_filter_left +.w8_no_upsample_above: + lea r3d, [hq+7] + 
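+; [annotation, not in upstream dav1d] A rough scalar reading of the
+; .filter_strength helper called below: the broadcast block-size byte
+; is matched against the z_filter_wh table with pcmpeqb, the broadcast
+; angle-delta byte is compared (pcmpgtb) against the threshold row for
+; is_sm taken from z_filter_t0, and popcnt of the pmovmskb result
+; gives the edge filter strength:
+;   strength = popcount({ i : wh == z_filter_wh[i] &&
+;                             angle_delta > thresh[is_sm][i] })
+; A strength of 1..3 later selects a 3- or 5-tap kernel in z_filter_k;
+; 0 skips edge filtering entirely. Illustrative pseudocode only.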
sub angled, 90 ; angle - 90 + call .filter_strength + test r3d, r3d + jz .w8_no_filter_above + popcnt r3d, r3d + vpbroadcastd xm3, [base+pb_8] + pminub xm3, [base+z_filter_s+8] + vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0] + vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] + pshufb xm2, xm1, [base+z_filter_s] ; 00 01 12 23 34 45 56 67 + pmaddubsw xm0, xm2, xm0 + pshufb xm3, xm1, xm3 ; 34 45 56 67 78 88 88 88 + shufps xm2, xm3, q2121 ; 12 23 34 45 56 67 78 88 + pmaddubsw xm2, xm4 + vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2] + pmaddubsw xm3, xm4 + movd xm4, r6m ; max_width + pminuw xm4, xm15 + vpbroadcastb xm4, xm4 + paddw xm0, xm2 + paddw xm0, xm3 + pmulhrsw xm0, xm13 + psubb xm4, [base+pb_1to32] + psrldq xm1, 1 + packuswb xm0, xm0 + vpblendvb xm0, xm1, xm4 + movq [rsp+65], xm0 +.w8_no_filter_above: + lea r3d, [angleq-51] + mov r3b, hb + cmp r3d, 8 + jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm + vpbroadcastd m0, [base+pb_90] + psubb m0, m7 + pand m0, m8 + pcmpgtb m0, m9 + pmovmskb r3d, m0 +.w8_filter_left: + test r3d, r3d + jz .w8_main + popcnt r3d, r3d + vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0] + vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1] + vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2] + cmp hd, 32 + jne .w8_filter_left_h16 + movu xm2, [rsp+27] + vinserti128 m2, [rsp+35], 1 + vpbroadcastd xm0, [base+pb_5] + vbroadcasti128 m3, [base+z_filter_s+ 8] + vbroadcasti128 m1, [base+z_filter_s+12] + vbroadcasti128 m4, [base+z_filter_s+16] + pmaxub m3, m0 + pshufb m3, m2, m3 + pmaddubsw m3, m7 + pshufb m1, m2, m1 + pmaddubsw m1, m8 + pshufb m2, m4 + pmaddubsw m2, m9 + paddw m3, m1 + paddw m3, m2 + pmulhrsw m3, m13 + jmp .w8_filter_left_top16 +.w8_filter_left_h16: + mov r5d, 10 + cmp hd, 16 + cmovs r5d, hd + xor r5d, 15 ; h == 16 ? 
5 : 15 - h + movd xm0, r5d + vpbroadcastb m0, xm0 +.w8_filter_left_top16: + vbroadcasti128 m1, [base+z_filter_s+12] + vinserti128 m2, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab + vbroadcasti128 m4, [base+z_filter_s+16] + vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd + vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef + pmaxub m0, m2 + movu xm2, [rsp+49] + vinserti128 m2, [rsp+43], 1 + pshufb m0, m2, m0 + pmaddubsw m0, m7 + movd xm7, r7m ; max_height + pshufb m1, m2, m1 + pmaddubsw m1, m8 + pshufb m2, m4 + pmaddubsw m2, m9 + pminsw xm7, xm15 + paddw m1, m0 + vpbroadcastb m7, xm7 + paddw m1, m2 + pmulhrsw m1, m13 + psubb m7, [base+pb_32to1] + packuswb m3, m1 + vpermq m3, m3, q1320 + vpblendvb m3, [rsp+32], m7 + mova [rsp+32], m3 + jmp .w8_main +.w8_upsample_left: + call .upsample_left +.w8_main: + movd xm3, dxd + lea r5, [rsp+56] ; left-7 + pshufd xm1, xm5, q3120 + pand xm5, xm14 + vpbroadcastw m3, xm3 + pxor xm0, xm0 + psubw xm2, xm15, xm5 + psraw xm1, 6 + lea r9, [strideq*3] + paddw m7, m3, m3 + psubw xm9, xm0, xm1 ; base_y + psllw xm5, 8 + punpcklwd xm8, xm9, xm0 ; base_y 0, 1, 4, 5 + vpblendd m3, m7, 0xf0 ; xpos0 xpos1 + por xm5, xm2 ; 64-frac_y, frac_y + punpckhwd xm9, xm0 ; base_y 2, 3, 6, 7 + paddw m6, m3 + vinserti128 m12, m5, xm5, 1 +.w8_loop: + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x0 + movu xm0, [rsp+r2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x1 + vinserti128 m0, [rsp+r3], 1 + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x2 + movu xm1, [rsp+r2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x3 + vinserti128 m1, [rsp+r3], 1 + pand m2, m14, m6 + paddsw m4, m6, m7 + psubw m5, m15, m2 + psllw m2, 8 + pshufb m0, m10 + por m2, m5 + pmaddubsw m0, m2 + pand m2, m14, m4 + psubw m5, m15, m2 + psllw m2, 8 + pshufb m1, m10 + por m2, m5 + pmaddubsw m1, m2 + cmp r3d, 64 + jge .w8_toponly + mova m5, m7 + vpgatherdq m3, [r5+xm9], m7 + mova m7, m5 + vpgatherdq m2, [r5+xm8], m5 + pshufb m3, m11 + pshufb m2, m11 + punpckldq m5, m2, m3 ; a0 b0 c0 d0 a1 b1 c1 d1 e0 f0 g0 h0 e1 f1 g1 h1 + punpckhdq m2, m3 ; a2 b2 c2 d2 a3 b3 c3 d3 e2 f2 g2 h2 e3 f3 g3 h3 + vpermq m5, m5, q3120 ; y0 y1 + vpermq m2, m2, q3120 ; y2 y3 + pmaddubsw m5, m12 + pmaddubsw m2, m12 + psraw m6, 15 ; base_x < topleft + vpblendvb m0, m5, m6 + psraw m3, m4, 15 + vpblendvb m1, m2, m3 +.w8_toponly: + pmulhrsw m0, m13 + pmulhrsw m1, m13 + paddw m6, m4, m7 ; xpos += dx + add r5, dyq + packuswb m0, m1 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*2], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+r9 ], xm1 + sub hd, 4 + jz .w8_end + lea dstq, [dstq+strideq*4] + cmp r2d, r8d + jge .w8_loop +.w8_leftonly_loop: + mova m0, m7 + vpgatherdq m5, [r5+xm9], m7 + mova m7, m0 + vpgatherdq m3, [r5+xm8], m0 + add r5, dyq + pshufb m2, m5, m11 + pshufb m1, m3, m11 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + vpermq m0, m0, q3120 + vpermq m1, m1, q3120 + pmaddubsw m0, m12 + pmaddubsw m1, m12 + pmulhrsw m0, m13 + pmulhrsw m1, m13 + packuswb m0, m1 + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*2], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+r9 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_leftonly_loop +.w8_end: + RET +.w16: + mov r8d, hd + test angled, 0x400 + jnz .w16_main + lea r3d, [hq+15] + sub angled, 90 + call .filter_strength + test r3d, r3d + jz .w16_no_filter_above + popcnt r3d, r3d + vbroadcasti128 m6, [tlq+1] + mova xm2, [base+z_filter_s] + vinserti128 m2, [base+z_filter_s+14], 1 ; 
00 01 12 23 34 45 56 67 67 78 89 9a ab bc cd de + movu xm3, [base+z_filter_s+8] + vinserti128 m3, [base+z_filter_s+22], 1 ; 34 45 56 67 78 89 9a ab ab bc cd de ef ff ff ff + vpblendd m1, m6, 0xf0 + vpbroadcastd m0, [base+z_filter_k-4+r3*4+12*0] + vpbroadcastd m4, [base+z_filter_k-4+r3*4+12*1] + vpbroadcastd m5, [base+z_filter_k-4+r3*4+12*2] + pshufb m2, m1, m2 + pshufb m1, m3 + pmaddubsw m0, m2, m0 + shufps m2, m1, q2121 ; 12 23 34 45 56 67 78 89 89 9a ab bc cd de ef ff + pmaddubsw m2, m4 + pmaddubsw m1, m5 + movd xm4, r6m ; max_width + pminsw xm4, xm15 + vpbroadcastb xm4, xm4 + paddw m0, m2 + paddw m0, m1 + pmulhrsw m0, m13 + psubb xm4, [base+pb_1to32] + vextracti128 xm2, m0, 1 + packuswb xm0, xm2 + vpblendvb xm0, xm6, xm4 + movu [rsp+65], xm0 +.w16_no_filter_above: + vpbroadcastd m0, [base+pb_90] + psubb m0, m7 + pand m0, m8 + pcmpgtb m0, m9 + pmovmskb r3d, m0 + test r3d, r3d + jz .w16_main + popcnt r3d, r3d + vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0] + vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1] + vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2] +.w16_filter_left: + movd xm6, r7m ; max_height + pminsw xm6, xm15 + vpbroadcastb m6, xm6 + cmp hd, 32 + jl .w16_filter_left_h16 + vpbroadcastd xm0, [base+pb_5] + vbroadcasti128 m10, [base+z_filter_s+ 8] + vbroadcasti128 m11, [base+z_filter_s+12] + vbroadcasti128 m12, [base+z_filter_s+16] + je .w16_filter_left_h32 + movu m3, [tlq-69] + movu m5, [tlq-61] + pmaxub m1, m10, m0 + pshufb m1, m3, m1 + pmaddubsw m1, m7 + pshufb m2, m3, m11 + pmaddubsw m2, m8 + pshufb m3, m12 + pmaddubsw m3, m9 + paddw m1, m2 + pshufb m2, m5, m10 + pmaddubsw m2, m7 + pshufb m4, m5, m11 + pmaddubsw m4, m8 + pshufb m5, m12 + pmaddubsw m5, m9 + paddw m1, m3 + vpbroadcastd m3, [base+pb_32] + paddb m3, [base+pb_32to1] + paddw m2, m4 + paddw m2, m5 + pmulhrsw m1, m13 + pmulhrsw m2, m13 + psubb m3, m6, m3 + packuswb m1, m2 + vpblendvb m1, [tlq-64], m3 + mova [rsp], m1 + jmp .w16_filter_left_top32 +.w16_filter_left_h32: + pmaxub m10, m0 +.w16_filter_left_top32: + movu xm2, [tlq-37] + vinserti128 m2, [tlq-29], 1 + pshufb m3, m2, m10 + pshufb m1, m2, m11 + pshufb m2, m12 + pmaddubsw m3, m7 + pmaddubsw m1, m8 + pmaddubsw m2, m9 + paddw m3, m1 + paddw m3, m2 + pmulhrsw m3, m13 + jmp .w16_filter_left_top16 +.w16_filter_left_h16: + mov r5d, 10 + cmp hd, 16 + cmovs r5d, hd + xor r5d, 15 ; h == 16 ? 
5 : 15 - h + movd xm0, r5d + vpbroadcastb m0, xm0 +.w16_filter_left_top16: + movu xm2, [tlq-15] + vinserti128 m2, [tlq-21], 1 + vbroadcasti128 m1, [base+z_filter_s+12] + vbroadcasti128 m4, [base+z_filter_s+16] + vinserti128 m5, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 34 45 56 67 78 89 9a ab + vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd + vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef + pmaxub m0, m5 + pshufb m0, m2, m0 + pmaddubsw m0, m7 + pshufb m1, m2, m1 + pmaddubsw m1, m8 + pshufb m2, m4 + pmaddubsw m2, m9 + psubb m6, [base+pb_32to1] + paddw m1, m0 + paddw m1, m2 + pmulhrsw m1, m13 + packuswb m3, m1 + vpermq m3, m3, q1320 + vpblendvb m3, [tlq-32], m6 + mova [rsp+32], m3 +.w16_main: + movd xm1, dyd + vbroadcasti128 m10, [base+z_filter_s+2] + movd xm7, dxd + vbroadcasti128 m11, [base+z2_shuf_h2] + vpbroadcastw m1, xm1 + vpbroadcastw m7, xm7 + mov r7, dstq + pmullw m0, m1, [base+z2_ymul] + psllw xm1, 4 + paddw m6, m7, [base+z2_base_inc] + lea r9d, [dxq+(65<<6)] ; xpos + movd [rsp+156], xm1 +.w16_loop0: + mov r2d, r9d + mova [rsp+160], m0 + lea r5, [rsp+60] ; left-3 + mova [rsp+192], m6 + pxor m1, m1 + psraw m2, m0, 6 + pand m0, m14 + psubw m9, m1, m2 ; base_y + psubw m12, m15, m0 + punpcklwd m8, m9, m1 ; base_y 0, 1, 2, 3, 8, 9, 10, 11 + psllw m0, 8 + punpckhwd m9, m1 ; base_y 4, 5, 6, 7, 12, 13, 14, 15 + por m12, m0 ; 64-frac_y, frac_y +.w16_loop: + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x0 + movu xm0, [rsp+r2] + vinserti128 m0, [rsp+r2+8], 1 + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x1 + movu xm1, [rsp+r3] + vinserti128 m1, [rsp+r3+8], 1 + pand m2, m14, m6 + paddsw m5, m6, m7 + psubw m3, m15, m2 + psllw m2, 8 + pshufb m0, m10 + por m2, m3 + pmaddubsw m0, m2 + pand m2, m14, m5 + psubw m3, m15, m2 + psllw m2, 8 + pshufb m1, m10 + por m2, m3 + pmaddubsw m1, m2 + cmp r3d, 64 + jge .w16_toponly + punpckhwd m2, m5, m5 ; mask out unnecessary loads + vpgatherdd m4, [r5+m9], m2 + punpcklwd m2, m5, m5 + vpgatherdd m3, [r5+m8], m2 + pshufb m4, m11 ; e0 f0 g0 h0 e1 f1 g1 h1 m0 n0 o0 p0 m1 n1 o1 p1 + pshufb m3, m11 ; a0 b0 c0 d0 a1 b1 c1 d1 i0 j0 k0 l0 i1 j1 k1 l1 + punpcklqdq m2, m3, m4 ; y0 + punpckhqdq m3, m4 ; y1 + pmaddubsw m2, m12 + pmaddubsw m3, m12 + psraw m6, 15 ; base_x < topleft + vpblendvb m0, m2, m6 + psraw m6, m5, 15 + vpblendvb m1, m3, m6 +.w16_toponly: + pmulhrsw m0, m13 + pmulhrsw m1, m13 + paddw m6, m5, m7 ; xpos += dx + sub r5, 2 + packuswb m0, m1 + vpermq m0, m0, q3120 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + sub hd, 2 + jz .w16_end + lea dstq, [dstq+strideq*2] + cmp r2d, (63-16)<<6 + jge .w16_loop +.w16_leftonly_loop: + mova m0, m7 + vpgatherdd m4, [r5+m9], m7 + mova m7, m0 + vpgatherdd m3, [r5+m8], m0 + sub r5, 2 + pshufb m2, m4, m11 + pshufb m1, m3, m11 + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 + pmaddubsw m0, m12 + pmaddubsw m1, m12 + pmulhrsw m0, m13 + pmulhrsw m1, m13 + packuswb m0, m1 + vpermq m0, m0, q3120 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_leftonly_loop +.w16_end: + sub r8d, 1<<8 + jl .w16_ret + vpbroadcastd m0, [rsp+156] + paddw m0, [rsp+160] ; base_y += 16*dy + paddw m6, m13, [rsp+192] + add r7, 16 + add r9d, 16<<6 + movzx hd, r8b + mov dstq, r7 + paddw m6, m13 ; base_x += 16*64 + jmp .w16_loop0 +.w16_ret: + RET +.w32: + mova m2, [tlq+32] + lea r8d, [hq+(1<<8)] + mova [rsp+96], m2 + test angled, 0x400 + jnz .w16_main + vpbroadcastd m7, [base+z_filter_k+4*2+12*0] + vpbroadcastd 
m8, [base+z_filter_k+4*2+12*1] + vpbroadcastd m9, [base+z_filter_k+4*2+12*2] + mova xm5, [base+z_filter_s] + vinserti128 m5, [base+z_filter_s+10], 1 ; 00 01 12 23 34 45 56 67 45 56 67 78 89 9a ab bc + vinserti128 m1, [tlq+11], 1 + movu xm6, [base+z_filter_s+12] + vinserti128 m6, [base+z_filter_s+22], 1 ; 56 67 78 89 9a ab bc cd ab bc cd de ef ff ff ff + movu xm3, [tlq+ 6] + vinserti128 m3, [tlq+17], 1 + movd xm0, r6m ; max_width + pminsw xm0, xm15 + vpbroadcastb m10, xm0 +.w32_filter_above: + pshufb m0, m1, m5 + shufps m4, m5, m6, q1021 ; 12 23 34 45 56 67 78 89 67 78 89 9a ab bc cd de + pmaddubsw m0, m7 + pshufb m2, m1, m4 + shufps m5, m6, q2132 ; 34 45 56 67 78 89 9a ab 89 9a ab bc cd de ef ff + pmaddubsw m2, m8 + pshufb m1, m5 + pmaddubsw m1, m9 + paddw m0, m2 + paddw m0, m1 + pshufb m1, m3, m4 + pmaddubsw m1, m7 + pshufb m2, m3, m5 + pmaddubsw m2, m8 + pshufb m3, m6 + pmaddubsw m3, m9 + paddw m1, m2 + paddw m1, m3 + pmulhrsw m0, m13 + pmulhrsw m1, m13 + psubb m10, [base+pb_1to32] + packuswb m0, m1 + vpblendvb m0, [tlq+1], m10 + movu [rsp+65], m0 + jmp .w16_filter_left +.w64: + mova m2, [tlq+32] + mov r3d, [tlq+64] + lea r8d, [hq+(3<<8)] + mova [rsp+ 96], m2 + mov [rsp+128], r3d + test angled, 0x400 + jnz .w16_main + vpbroadcastd m7, [base+z_filter_k+4*2+12*0] + vpbroadcastd m8, [base+z_filter_k+4*2+12*1] + vpbroadcastd m9, [base+z_filter_k+4*2+12*2] + movu xm6, [base+z_filter_s+ 4] + vinserti128 m6, [base+z_filter_s+10], 1 ; 12 23 34 45 56 67 78 89 45 56 67 78 89 9a ab bc + movu xm3, [tlq+30] + vinserti128 m3, [tlq+43], 1 + movu xm5, [base+z_filter_s+16] + vinserti128 m5, [base+z_filter_s+22], 1 ; 78 89 9a ab bc cd de ef ab bc cd de ef ff ff ff + pshufb m0, m3, m6 + shufps m4, m6, m5, q1021 ; 34 45 56 67 78 89 9a ab 67 78 89 9a ab bc cd de + pmaddubsw m0, m7 + pshufb m2, m3, m4 + shufps m6, m5, q2132 ; 56 67 78 89 9a ab bc cd 89 9a ab bc cd de ef ff + pmaddubsw m2, m8 + pshufb m3, m6 + pmaddubsw m3, m9 + paddw m0, m2 + paddw m0, m3 + movu xm2, [tlq+36] + vinserti128 m2, [tlq+49], 1 + pshufb m4, m2, m4 + pmaddubsw m4, m7 + pshufb m3, m2, m6 + pmaddubsw m3, m8 + pshufb m2, m5 + pmaddubsw m2, m9 + movd xm5, r6m ; max_width + pminsw xm5, xm15 + vpbroadcastb m10, xm5 + paddw m3, m4 + paddw m2, m3 + vpbroadcastd m3, [base+pb_32] + pmulhrsw m0, m13 + pmulhrsw m2, m13 + mova xm5, [base+z_filter_s] + vinserti128 m5, [base+z_filter_s+6], 1 + psubb m3, m10, m3 + psubb m3, [base+pb_1to32] + vinserti128 m1, [tlq+13], 1 + packuswb m0, m2 + vpblendvb m0, [tlq+33], m3 + movu xm3, [tlq+ 6] + vinserti128 m3, [tlq+19], 1 + movu [rsp+97], m0 + jmp .w32_filter_above + +cglobal ipred_z3_8bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase + %assign org_stack_offset stack_offset + lea r6, [ipred_z3_avx2_table] + tzcnt hd, hm + movifnidn angled, anglem + lea r7, [dr_intra_derivative+45*2-1] + dec tlq + movsxd hq, [r6+hq*4] + sub angled, 180 + add hq, r6 + mov dyd, angled + neg dyd + xor angled, 0x400 + or dyq, ~0x7e + movzx dyd, word [r7+dyq] + vpbroadcastd m3, [pw_512] + vpbroadcastd m4, [pw_62] + vpbroadcastd m5, [pw_64] + mov org_wd, wd + jmp hq +.h4: + lea r7, [strideq*3] + cmp angleb, 40 + jae .h4_no_upsample + lea r4d, [angleq-1024] + sar r4d, 7 + add r4d, wd + jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm) + ALLOC_STACK -32, 9 + movu xm8, [tlq-7] + pshufb xm0, xm8, [z_upsample1-4] + vpbroadcastb xm2, xm8 + pshufb xm1, xm8, [z_filter_s+2] + mova [rsp+16], xm2 ; top[max_base_y] + vpbroadcastd xm2, [pb_36_m4] + add dyd, dyd + pmaddubsw xm0, xm2 + pmaddubsw xm1, xm2 
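+; [annotation, not in upstream dav1d] The pb_36_m4/pmaddubsw pair above
+; is the AV1 (-1 9 9 -1)/16 edge upsampler in 8-bit SIMD: pmaddubsw
+; forms 36*(b+c) - 4*(a+d) in 16 bits, and pmulhrsw with pw_512 acts
+; as a rounded shift, since (x*512 + 0x4000) >> 15 == (x + 32) >> 6:
+;   out[2*i+1] = clip((9*(in[i] + in[i+1]) - in[i-1] - in[i+2] + 8) >> 4)
+; The same two-instruction pattern implements the directional blend in
+; every z1/z2/z3 inner loop:
+;   out[x] = (edge[base]*(64 - frac) + edge[base+1]*frac + 32) >> 6
+; where frac = pos & 62 (pw_62) is kept even so both packed byte
+; weights fit the signed operand of pmaddubsw. A scalar model only.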
+ movd xm7, dyd + mov r2d, dyd + vpbroadcastw m7, xm7 + paddw xm1, xm0 + pmulhrsw xm1, xm3 + pslldq m6, m7, 8 + paddw xm2, xm7, xm7 + paddw m6, m7 + packuswb xm1, xm1 + paddw m6, m2 + punpcklbw xm1, xm8 + mova xm8, [z_transpose4] + psllw m7, 2 + pshufb xm1, [pb_15to0] + mova [rsp], xm1 +.h4_upsample_loop: + lea r4d, [r2+dyq] + shr r2d, 6 + vpbroadcastq m1, [rsp+r2] + lea r2d, [r4+dyq] + shr r4d, 6 + vpbroadcastq m2, [rsp+r4] + lea r4d, [r2+dyq] + shr r2d, 6 + movq xm0, [rsp+r2] + lea r2d, [r4+dyq] + shr r4d, 6 + movhps xm0, [rsp+r4] + vpblendd m1, m2, 0xc0 + pand m2, m4, m6 + vpblendd m0, m1, 0xf0 + psubw m1, m5, m2 + psllw m2, 8 + por m1, m2 + pmaddubsw m0, m1 + paddw m6, m7 + pmulhrsw m0, m3 + vextracti128 xm1, m0, 1 + packuswb xm1, xm0 + pshufb xm1, xm8 + movd [dstq+strideq*0], xm1 + pextrd [dstq+strideq*1], xm1, 1 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+r7 ], xm1, 3 + add dstq, 4 + sub wd, 4 + jg .h4_upsample_loop + RET +ALIGN function_align +.filter_strength: ; h4/h8/h16 +%define base r4-z_filter_t0 + lea r4, [z_filter_t0] + movd xm0, maxbased + movd xm2, angled + shr angled, 8 ; is_sm << 1 + vpbroadcastb m0, xm0 + vpbroadcastb m2, xm2 + pcmpeqb m1, m0, [base+z_filter_wh] + pand m1, m2 + mova xm2, [r4+angleq*8] + pcmpgtb m1, m2 + pmovmskb r5d, m1 + ret +.h4_no_upsample: + %assign stack_offset org_stack_offset + ALLOC_STACK -16, 12 + mov maxbased, 7 + test angled, 0x400 ; !enable_intra_edge_filter + jnz .h4_main + lea maxbased, [wq+3] + call .filter_strength + mov maxbased, 7 + test r5d, r5d + jz .h4_main ; filter_strength == 0 + popcnt r5d, r5d + vpbroadcastd m7, [base+pb_7] + vbroadcasti128 m2, [tlq-14] + pmaxub m1, m7, [base+z_filter_s-4] + vpbroadcastd m8, [base+z_filter_k-4+r5*4+12*0] + pmaxub m7, [base+z_filter_s+4] + vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] + vpbroadcastd m10, [base+z_filter_k-4+r5*4+12*2] + pshufb m0, m2, m1 + shufps m1, m7, q2121 + pmaddubsw m0, m8 + pshufb m1, m2, m1 + pmaddubsw m1, m9 + pshufb m2, m7 + pmaddubsw m2, m10 + paddw m0, m1 + paddw m0, m2 + pmulhrsw m0, m3 + mov r4d, 9 + lea tlq, [rsp+15] + cmp wd, 4 + cmovne maxbased, r4d + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + mova [rsp], xm0 +.h4_main: + movd xm6, dyd + vpbroadcastq m0, [z_base_inc] ; base_inc << 6 + mov r4, tlq + sub tlq, 4 + neg dyq + vpbroadcastw m6, xm6 + sub r4, maxbaseq + shl maxbased, 6 + vpbroadcastb m7, [r4] + lea r4, [dyq+63] ; ypos + movd xm9, maxbased + not maxbased + vbroadcasti128 m8, [z3_shuf_w4] + add maxbased, 64 + vpbroadcastw m9, xm9 + psrlw m7, 8 ; top[max_base_y] + paddw m10, m6, m6 + psubw m9, m0 ; max_base_y + vpblendd m6, m10, 0xcc + mova xm0, xm10 + paddw m6, m0 ; ypos2 ypos3 ypos0 ypos1 + paddw m10, m10 + mova xm11, [z_transpose4] +.h4_loop: + lea r5, [r4+dyq] + sar r4, 6 ; base0 + vpbroadcastq m1, [tlq+r4] + lea r4, [r5+dyq] + sar r5, 6 ; base1 + vpbroadcastq m2, [tlq+r5] + lea r5, [r4+dyq] + sar r4, 6 ; base2 + movq xm0, [tlq+r4] + lea r4, [r5+dyq] + sar r5, 6 ; base3 + movhps xm0, [tlq+r5] + vpblendd m1, m2, 0xc0 + pand m2, m4, m6 ; frac + vpblendd m0, m1, 0xf0 + psubw m1, m5, m2 ; 64-frac + psllw m2, 8 + pshufb m0, m8 + por m1, m2 ; 64-frac, frac + pmaddubsw m0, m1 + pcmpgtw m1, m9, m6 ; base < max_base_y + pmulhrsw m0, m3 + paddw m6, m10 ; ypos += dy + vpblendvb m0, m7, m0, m1 + vextracti128 xm1, m0, 1 + packuswb xm1, xm0 + pshufb xm1, xm11 ; transpose + movd [dstq+strideq*0], xm1 + pextrd [dstq+strideq*1], xm1, 1 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+r7 ], xm1, 3 + sub wd, 4 + jz .h4_end + add dstq, 4 + cmp r4d, maxbased + jg 
.h4_loop + packuswb xm7, xm7 +.h4_end_loop: + movd [dstq+strideq*0], xm7 + movd [dstq+strideq*1], xm7 + movd [dstq+strideq*2], xm7 + movd [dstq+r7 ], xm7 + add dstq, 4 + sub wd, 4 + jg .h4_end_loop +.h4_end: + RET +ALIGN function_align +.h8: + lea r4d, [angleq+216] + mov r4b, wb + cmp r4d, 8 + ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 + %assign stack_offset org_stack_offset + ALLOC_STACK -32, 8 + and r4d, 4 + mova xm0, [tlq-15] + vinserti128 m0, [tlq- 9], 1 + movd xm1, r4d + movu xm2, [z_filter_s+2] + vinserti128 m2, [z_filter_s+6], 1 + vpbroadcastb xm1, xm1 ; w & 4 + vpbroadcastd m7, [pb_36_m4] + pmaxub xm1, [z_upsample1-4] ; clip 4x8 + vinserti128 m1, [z_upsample1], 1 + add dyd, dyd + pshufb m1, m0, m1 + pshufb m2, m0, m2 + vinserti128 m0, [tlq-7], 1 + movd xm6, dyd + pmaddubsw m1, m7 + pmaddubsw m2, m7 + vpbroadcastw m6, xm6 + mov r2d, dyd + lea r5, [strideq*3] + paddw m7, m6, m6 + paddw m1, m2 + vpblendd m6, m7, 0xf0 + pmulhrsw m1, m3 + pslldq m2, m7, 8 + paddw m7, m7 + paddw m6, m2 + vbroadcasti128 m2, [pb_15to0] + packuswb m1, m1 + punpcklbw m1, m0 + pshufb m1, m2 + vextracti128 [rsp+ 0], m1, 1 + mova [rsp+16], xm1 +.h8_upsample_loop: + lea r4d, [r2+dyq] + shr r2d, 6 ; base0 + movu xm0, [rsp+r2] + lea r2d, [r4+dyq] + shr r4d, 6 ; base1 + vinserti128 m0, [rsp+r4], 1 + lea r4d, [r2+dyq] + shr r2d, 6 ; base2 + pand m1, m4, m6 + psubw m2, m5, m1 + psllw m1, 8 + por m2, m1 + punpcklqdq m1, m2, m2 ; frac0 frac1 + pmaddubsw m0, m1 + movu xm1, [rsp+r2] + lea r2d, [r4+dyq] + shr r4d, 6 ; base3 + vinserti128 m1, [rsp+r4], 1 + punpckhqdq m2, m2 ; frac2 frac3 + pmaddubsw m1, m2 + pmulhrsw m0, m3 + paddw m6, m7 + pmulhrsw m1, m3 + lea r4, [dstq+strideq*4] + psllw m1, 8 + por m0, m1 + vextracti128 xm1, m0, 1 + punpcklbw xm2, xm0, xm1 + punpckhbw xm0, xm1 + movd [dstq+strideq*0], xm2 + pextrd [dstq+strideq*1], xm2, 1 + pextrd [dstq+strideq*2], xm2, 2 + pextrd [dstq+r5 ], xm2, 3 + movd [r4 +strideq*0], xm0 + pextrd [r4 +strideq*1], xm0, 1 + pextrd [r4 +strideq*2], xm0, 2 + pextrd [r4 +r5 ], xm0, 3 + add dstq, 4 + sub wd, 4 + jg .h8_upsample_loop + RET +.h8_no_intra_edge_filter: + and maxbased, 7 + or maxbased, 8 ; imin(w+7, 15) + jmp .h8_main +.h8_no_upsample: + %assign stack_offset org_stack_offset + ALLOC_STACK -32, 10 + lea maxbased, [wq+7] + test angled, 0x400 + jnz .h8_no_intra_edge_filter + call .filter_strength + test r5d, r5d + jz .h8_main ; filter_strength == 0 + popcnt r5d, r5d + vpbroadcastd xm6, [base+pb_15] + pcmpeqb xm1, xm1 + psubusb xm6, xm0 + psubb xm6, xm1 ; w == 4 ? 
5 : 1 + movu xm2, [tlq-16] + pmaxub xm1, xm6, [base+z_filter_s] + vinserti128 m2, [tlq-14], 1 + vinserti128 m1, [base+z_filter_s+12], 1 + vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0] + pmaxub xm6, [base+z_filter_s+ 8] + vinserti128 m6, [base+z_filter_s+20], 1 + pshufb m0, m2, m1 + pmaddubsw m0, m7 + vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*1] + movzx r4d, byte [tlq-15] + shufps m1, m6, q2121 + pshufb m1, m2, m1 + pmaddubsw m1, m7 + paddw m0, m1 + sub r5d, 3 + jnz .h8_3tap + vpbroadcastd m7, [z_filter_k+4*8] + movzx r2d, byte [tlq-14] + pshufb m2, m6 + pmaddubsw m2, m7 + sub r2d, r4d + lea r2d, [r2+r4*8+4] + shr r2d, 3 + mov [rsp+15], r2b + paddw m0, m2 +.h8_3tap: + pmulhrsw m0, m3 + sar r5d, 1 + lea tlq, [rsp+31] + add r5d, 17 + cmp wd, 16 + cmovns maxbased, r5d + neg r5 + mov [tlq+r5], r4b + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + mova [tlq-15], xm0 +.h8_main: + movd xm2, dyd + vbroadcasti128 m0, [z_base_inc] + mov r4, tlq + sub tlq, 8 + neg dyq + vpbroadcastw m2, xm2 + sub r4, maxbaseq + shl maxbased, 6 + vpbroadcastb m7, [r4] + lea r4, [dyq+63] + movd xm9, maxbased + not maxbased + vbroadcasti128 m8, [z3_shuf] + add maxbased, 64 + vpbroadcastw m9, xm9 + psrlw m7, 8 + psubw m9, m0 + paddw m6, m2, m2 + vpblendd m2, m6, 0x0f +.h8_loop: + lea r5, [r4+dyq] + sar r4, 6 + pand m0, m4, m2 + psubw m1, m5, m0 + psllw m0, 8 + por m1, m0 + vbroadcasti128 m0, [tlq+r4] + lea r4, [r5+dyq] + sar r5, 6 + vinserti128 m0, [tlq+r5], 0 + sub rsp, 8*2 + pshufb m0, m8 + pmaddubsw m0, m1 + pcmpgtw m1, m9, m2 + paddw m2, m6 + pmulhrsw m0, m3 + vpblendvb m0, m7, m0, m1 + vextracti128 xm1, m0, 1 + psllw xm0, 8 + por xm0, xm1 ; interleave rows (partial transpose) + mova [rsp], xm0 + sub wd, 2 + jz .h8_transpose + cmp r4d, maxbased + jg .h8_loop + packuswb xm0, xm7, xm7 +.h8_end_loop: + sub rsp, 8*2 + mova [rsp], xm0 + sub wd, 2 + jg .h8_end_loop +.h8_transpose: + mova xm2, [rsp+16*1] + sub org_wd, 8 + lea r2, [strideq*3] + lea r6, [dstq+org_wq] + cmovns dstq, r6 + punpcklwd xm1, xm2, xm0 + punpckhwd xm2, xm0 + lea r6, [dstq+strideq*4] + jge .h8_w8 + add rsp, 16*2 + movd [dstq+strideq*0], xm1 + pextrd [dstq+strideq*1], xm1, 1 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+r2 ], xm1, 3 + movd [r6 +strideq*0], xm2 + pextrd [r6 +strideq*1], xm2, 1 + pextrd [r6 +strideq*2], xm2, 2 + pextrd [r6 +r2 ], xm2, 3 + jmp .h8_end +.h8_w8_loop: + mova xm0, [rsp+16*0] + mova xm2, [rsp+16*1] + punpcklwd xm1, xm2, xm0 + punpckhwd xm2, xm0 +.h8_w8: ; w8/w16/w32 + mova xm0, [rsp+16*2] + mova xm4, [rsp+16*3] + add rsp, 16*4 + punpcklwd xm3, xm4, xm0 + punpckhwd xm4, xm0 + punpckldq xm0, xm3, xm1 + punpckhdq xm3, xm1 + punpckldq xm1, xm4, xm2 + punpckhdq xm4, xm2 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm3 + movhps [dstq+r2 ], xm3 + movq [r6 +strideq*0], xm1 + movhps [r6 +strideq*1], xm1 + movq [r6 +strideq*2], xm4 + movhps [r6 +r2 ], xm4 + sub dstq, 8 + sub r6, 8 + sub org_wd, 8 + jge .h8_w8_loop +.h8_end: + RET +.h16_no_intra_edge_filter: + and maxbased, 15 + or maxbased, 16 ; imin(w+15, 31) + jmp .h16_main +ALIGN function_align +.h16: + %assign stack_offset org_stack_offset + ALLOC_STACK -64, 12 + lea maxbased, [wq+15] + test angled, 0x400 + jnz .h16_no_intra_edge_filter + call .filter_strength + test r5d, r5d + jz .h16_main ; filter_strength == 0 + popcnt r5d, r5d + vpbroadcastd m11, [base+pb_27] + vpbroadcastd m1, [base+pb_1] + vbroadcasti128 m6, [base+z_filter_s+12] + vinserti128 m2, m6, [base+z_filter_s+4], 0 + vinserti128 m6, [base+z_filter_s+20], 1 + movu xm10, [tlq-18] 
+ vinserti128 m10, [tlq-14], 1 + vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*0] + vbroadcasti128 m7, [base+z_filter_s+8] + vinserti128 m8, m7, [base+z_filter_s+0], 0 + vinserti128 m7, [base+z_filter_s+16], 1 + psubusb m11, m0 + por m1, m11 + movu xm11, [tlq-32] + vinserti128 m11, [tlq-28], 1 + pmaxub m8, m1 + pmaxub m7, m1 + pshufb m0, m10, m2 + shufps m2, m6, q2121 + pmaddubsw m0, m9 + pshufb m1, m11, m8 + shufps m8, m7, q2121 + pmaddubsw m1, m9 + vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] + movzx r4d, byte [tlq-31] + pshufb m2, m10, m2 + pmaddubsw m2, m9 + pshufb m8, m11, m8 + pmaddubsw m8, m9 + paddw m0, m2 + paddw m1, m8 + sub r5d, 3 + jnz .h16_3tap + vpbroadcastd m9, [z_filter_k+4*8] + movzx r2d, byte [tlq-30] + pshufb m10, m6 + pmaddubsw m10, m9 + pshufb m11, m7 + pmaddubsw m11, m9 + sub r2d, r4d + lea r2d, [r2+r4*8+4] + shr r2d, 3 + mov [rsp+31], r2b + paddw m0, m10 + paddw m1, m11 +.h16_3tap: + pmulhrsw m0, m3 + pmulhrsw m1, m3 + sar r5d, 1 + lea tlq, [rsp+63] + add r5d, 33 + cmp wd, 32 + cmovns maxbased, r5d + neg r5 + mov [tlq+r5], r4b + packuswb m0, m1 + vpermq m0, m0, q2031 + mova [tlq-31], m0 +.h16_main: + movd xm6, dyd + vbroadcasti128 m0, [z_base_inc] + mov r4, tlq + sub tlq, 8 + neg dyq + vpbroadcastw m6, xm6 + sub r4, maxbaseq + shl maxbased, 6 + vpbroadcastb m7, [r4] + lea r4, [dyq+63] + movd xm9, maxbased + not maxbased + vbroadcasti128 m8, [z3_shuf] + add maxbased, 64 + vpbroadcastw m9, xm9 + psubw m9, m0 + paddw m11, m6, m6 + psubw m10, m9, m3 ; 64*8 + vpblendd m6, m11, 0xf0 +.h16_loop: + lea r5, [r4+dyq] + sar r4, 6 + pand m1, m4, m6 + psubw m2, m5, m1 + psllw m1, 8 + por m2, m1 + movu xm0, [tlq+r4-0] + movu xm1, [tlq+r4-8] + lea r4, [r5+dyq] + sar r5, 6 + vinserti128 m0, [tlq+r5-0], 1 + vinserti128 m1, [tlq+r5-8], 1 + sub rsp, 32 + pshufb m0, m8 + pshufb m1, m8 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + pcmpgtw m1, m9, m6 + pcmpgtw m2, m10, m6 + packsswb m1, m2 + paddw m6, m11 + vpblendvb m0, m7, m0, m1 + vpermq m0, m0, q3120 + mova [rsp], m0 + sub wd, 2 + jz .h16_transpose + cmp r4d, maxbased + jg .h16_loop + mova m0, m7 +.h16_end_loop: + sub rsp, 32 + mova [rsp], m7 + sub wd, 2 + jg .h16_end_loop +.h16_transpose: + mova m2, [rsp+32*1] + sub org_wd, 8 + lea r2, [strideq*3] + lea r6, [dstq+org_wq] + cmovns dstq, r6 + punpcklbw m1, m2, m0 + punpckhbw m2, m0 + lea r3, [strideq*5] + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + lea r4, [strideq+r2*2] ; stride*7 + jge .h16_w8 + add rsp, 32*2 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+r2 ], xm0, 3 + vextracti128 xm0, m0, 1 + movd [dstq+strideq*4], xm1 + pextrd [dstq+r3 ], xm1, 1 + pextrd [dstq+r2*2 ], xm1, 2 + pextrd [dstq+r4 ], xm1, 3 + lea dstq, [dstq+strideq*8] + vextracti128 xm1, m1, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+r2 ], xm0, 3 + movd [dstq+strideq*4], xm1 + pextrd [dstq+r3 ], xm1, 1 + pextrd [dstq+r2*2 ], xm1, 2 + pextrd [dstq+r4 ], xm1, 3 + jmp .h16_end +.h16_w8_loop: + mova m0, [rsp+32*0] + mova m2, [rsp+32*1] + punpcklbw m1, m2, m0 + punpckhbw m2, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 +.h16_w8: + mova m2, [rsp+32*2] + mova m4, [rsp+32*3] + lea r6, [dstq+strideq*8] + add rsp, 32*4 + punpcklbw m3, m4, m2 + punpckhbw m4, m2 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + punpckldq m4, m2, m0 + punpckhdq m2, m0 + punpckldq m0, m3, m1 + punpckhdq m3, m1 + movq [dstq+strideq*0], xm4 + movhps [dstq+strideq*1], xm4 + 
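+; [annotation, not in upstream dav1d] The punpck{l,h}bw/wd/dq ladder
+; above is the in-register transpose the horizontal (z3) paths rely
+; on: each output column is first computed as a row along the left
+; edge and spilled downwards onto the stack (sub rsp, 32 per pair of
+; columns), and only here is the tile flipped so it can be stored as
+; ordinary rows. This keeps the inner loop identical in shape to the
+; vertical z1 case at the cost of one byte transpose per 8-column tile.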
vextracti128 xm4, m4, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+r2 ], xm2 + vextracti128 xm2, m2, 1 + movq [dstq+strideq*4], xm0 + movhps [dstq+r3 ], xm0 + vextracti128 xm0, m0, 1 + movq [dstq+r2*2 ], xm3 + movhps [dstq+r4 ], xm3 + vextracti128 xm3, m3, 1 + movq [r6+strideq*0], xm4 + movhps [r6+strideq*1], xm4 + movq [r6+strideq*2], xm2 + movhps [r6+r2 ], xm2 + movq [r6+strideq*4], xm0 + movhps [r6+r3 ], xm0 + movq [r6+r2*2 ], xm3 + movhps [r6+r4 ], xm3 + sub dstq, 8 + sub org_wd, 8 + jge .h16_w8_loop +.h16_end: + RET +ALIGN function_align +.h32: + %assign stack_offset org_stack_offset + ALLOC_STACK -96, 15 + lea maxbased, [wq+31] + and maxbased, 31 + or maxbased, 32 ; imin(w+31, 63) + test angled, 0x400 ; !enable_intra_edge_filter + jnz .h32_main + vbroadcasti128 m0, [pb_0to15] + mov r4d, 21 + mov r5d, 3 + movu xm11, [tlq-66] ; 56-63 + vinserti128 m11, [tlq-52], 1 ; 40-47 + sub r4d, wd ; 21-w + cmovns r5d, r4d + movu xm12, [tlq-58] ; 48-55 + vinserti128 m12, [tlq-44], 1 ; 32-39 + sub r4d, 8 ; 13-w + movd xm1, r5d + movu xm13, [tlq-34] ; 24-31 + vinserti128 m13, [tlq-20], 1 ; 8-15 + movd xm2, r4d + vpbroadcastb m1, xm1 + movu xm14, [tlq-28] ; 16-23 + vinserti128 m14, [tlq-14], 1 ; 0- 7 + vpbroadcastb m2, xm2 + pmaxsb m1, m0 ; clip 16x32 and (32|64)x32 + movu m7, [z_filter_s+4] + pshufb m11, m1 + vinserti128 m8, m7, [z_filter_s+8], 1 + vinserti128 m7, [z_filter_s+16], 0 + pmaxsb m2, m0 ; clip 8x32 + vpbroadcastd m9, [z_filter_k+4*2+12*0] + pshufb m12, m2 + pshufb m0, m11, m8 + pmaddubsw m0, m9 + pshufb m2, m12, m8 + pmaddubsw m2, m9 + pshufb m1, m13, m8 + pmaddubsw m1, m9 + shufps m8, m7, q1021 + pshufb m6, m14, m8 + pmaddubsw m6, m9 + vpbroadcastd m9, [z_filter_k+4*2+12*1] + pshufb m10, m11, m8 + pmaddubsw m10, m9 + paddw m0, m10 + pshufb m10, m12, m8 + pmaddubsw m10, m9 + paddw m2, m10 + pshufb m10, m13, m8 + pmaddubsw m10, m9 + shufps m8, m7, q2121 + paddw m1, m10 + pshufb m10, m14, m8 + pmaddubsw m10, m9 + paddw m6, m10 + vpbroadcastd m9, [z_filter_k+4*2+12*2] + pshufb m11, m8 + pmaddubsw m11, m9 + pshufb m12, m8 + pmaddubsw m12, m9 + movzx r4d, byte [tlq-63] + movzx r2d, byte [tlq-62] + paddw m0, m11 + paddw m2, m12 + pshufb m13, m8 + pmaddubsw m13, m9 + pshufb m14, m7 + pmaddubsw m14, m9 + paddw m1, m13 + paddw m6, m14 + sub r2d, r4d + lea r2d, [r2+r4*8+4] ; edge case for 64x32 + pmulhrsw m0, m3 + pmulhrsw m2, m3 + pmulhrsw m1, m3 + pmulhrsw m6, m3 + shr r2d, 3 + mov [rsp+31], r2b + lea tlq, [rsp+95] + mov [tlq-65], r4b + mov r4d, 65 + cmp wd, 64 + cmove maxbased, r4d + packuswb m0, m2 + packuswb m1, m6 + mova [tlq-63], m0 + mova [tlq-31], m1 +.h32_main: + movd xm6, dyd + mov r4, tlq + sub tlq, 8 + neg dyq + vpbroadcastw m6, xm6 + sub r4, maxbaseq + shl maxbased, 6 + vpbroadcastb m7, [r4] + lea r4, [dyq+63] + movd xm9, maxbased + not maxbased + vbroadcasti128 m8, [z3_shuf] + add maxbased, 64 + vpbroadcastw m9, xm9 + psubw m9, [z_base_inc] + mova m11, m6 + psubw m10, m9, m3 ; 64*8 +.h32_loop: + mov r5, r4 + sar r5, 6 + pand m1, m4, m6 + psubw m2, m5, m1 + psllw m1, 8 + por m2, m1 + movu xm0, [tlq+r5- 0] + vinserti128 m0, [tlq+r5-16], 1 + movu xm1, [tlq+r5- 8] + vinserti128 m1, [tlq+r5-24], 1 + sub rsp, 32 + add r4, dyq + pshufb m0, m8 + pshufb m1, m8 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + pcmpgtw m1, m9, m6 + pcmpgtw m2, m10, m6 + packsswb m1, m2 + paddw m6, m11 + vpblendvb m0, m7, m0, m1 + mova [rsp], m0 + dec wd + jz .h32_transpose + cmp r4d, maxbased + jg .h32_loop +.h32_end_loop: + sub rsp, 32 + mova [rsp], m7 + dec wd + jg 
.h32_end_loop +.h32_transpose: + lea dstq, [dstq+org_wq-8] + lea r2, [strideq*3] + lea r3, [strideq*5] + lea r4, [strideq+r2*2] ; stride*7 +.h32_w8_loop: + mova m7, [rsp+32*0] + mova m6, [rsp+32*1] + mova m5, [rsp+32*2] + mova m4, [rsp+32*3] + mova m3, [rsp+32*4] + mova m2, [rsp+32*5] + mova m1, [rsp+32*6] + mova m0, [rsp+32*7] + lea r6, [dstq+strideq*8] + add rsp, 32*8 + punpcklbw m8, m0, m1 + punpckhbw m0, m1 + punpcklbw m1, m2, m3 + punpckhbw m2, m3 + punpcklbw m3, m4, m5 + punpckhbw m4, m5 + punpcklbw m5, m6, m7 + punpckhbw m6, m7 + punpcklwd m7, m8, m1 + punpckhwd m8, m1 + punpcklwd m1, m0, m2 + punpckhwd m0, m2 + punpcklwd m2, m3, m5 + punpckhwd m3, m5 + punpcklwd m5, m4, m6 + punpckhwd m4, m6 + punpckldq m6, m7, m2 + punpckhdq m7, m2 + punpckldq m2, m8, m3 + punpckhdq m8, m3 + punpckldq m3, m1, m5 + punpckhdq m1, m5 + punpckldq m5, m0, m4 + punpckhdq m0, m4 + movq [dstq+strideq*0], xm6 + movhps [dstq+strideq*1], xm6 + vextracti128 xm6, m6, 1 + movq [dstq+strideq*2], xm7 + movhps [dstq+r2 ], xm7 + vextracti128 xm7, m7, 1 + movq [dstq+strideq*4], xm2 + movhps [dstq+r3 ], xm2 + vextracti128 xm2, m2, 1 + movq [dstq+r2*2 ], xm8 + movhps [dstq+r4 ], xm8 + vextracti128 xm8, m8, 1 + movq [r6+strideq*0], xm3 + movhps [r6+strideq*1], xm3 + vextracti128 xm3, m3, 1 + movq [r6+strideq*2], xm1 + movhps [r6+r2 ], xm1 + vextracti128 xm1, m1, 1 + movq [r6+strideq*4], xm5 + movhps [r6+r3 ], xm5 + vextracti128 xm5, m5, 1 + movq [r6+r2*2 ], xm0 + movhps [r6+r4 ], xm0 + lea r6, [r6+strideq*8] + vextracti128 xm0, m0, 1 + movq [r6+strideq*0], xm6 + movhps [r6+strideq*1], xm6 + movq [r6+strideq*2], xm7 + movhps [r6+r2 ], xm7 + movq [r6+strideq*4], xm2 + movhps [r6+r3 ], xm2 + movq [r6+r2*2 ], xm8 + movhps [r6+r4 ], xm8 + lea r6, [r6+strideq*8] + movq [r6+strideq*0], xm3 + movhps [r6+strideq*1], xm3 + movq [r6+strideq*2], xm1 + movhps [r6+r2 ], xm1 + movq [r6+strideq*4], xm5 + movhps [r6+r3 ], xm5 + movq [r6+r2*2 ], xm0 + movhps [r6+r4 ], xm0 + sub dstq, 8 + sub org_wd, 8 + jg .h32_w8_loop + RET +ALIGN function_align +.h64: + %assign stack_offset org_stack_offset + ALLOC_STACK -128, 16 + lea maxbased, [wq+63] + test angled, 0x400 ; !enable_intra_edge_filter + jnz .h64_main + mov r4d, 21 + vpbroadcastb xm11, [tlq-127] + vpblendd xm11, [tlq-130], 0x0e ; 120-127 + sub r4d, wd ; 21-w + mov r5d, 3 + vinserti128 m11, [tlq-116], 1 ; 104-111 + movu m7, [z_filter_s+4] + cmp wd, 32 + cmove r4d, r5d + vinserti128 m8, m7, [z_filter_s+8], 1 + vbroadcasti128 m6, [pb_0to15] + movd xm1, r4d + vpbroadcastd m9, [z_filter_k+4*2+12*0] + movu xm12, [tlq-122] ; 112-119 + vinserti128 m12, [tlq-108], 1 ; 96-103 + vpbroadcastb m1, xm1 + movu xm13, [tlq- 98] ; 88- 95 + vinserti128 m13, [tlq- 84], 1 ; 72- 79 + movu xm14, [tlq- 90] ; 80- 87 + vinserti128 m14, [tlq- 76], 1 ; 64- 71 + vinserti128 m7, [z_filter_s+16], 0 + pshufb m0, m11, m8 + pmaddubsw m0, m9 + pshufb m2, m12, m8 + pmaddubsw m2, m9 + pmaxsb m1, m6 ; clip (16|32)x64 + pshufb m13, m1 + pshufb m1, m13, m8 + pmaddubsw m1, m9 + pshufb m6, m14, m8 + pmaddubsw m6, m9 + vpbroadcastd m9, [z_filter_k+4*2+12*1] + shufps m15, m8, m7, q1021 + pshufb m10, m11, m15 + pmaddubsw m10, m9 + paddw m0, m10 + pshufb m10, m12, m15 + pmaddubsw m10, m9 + paddw m2, m10 + pshufb m10, m13, m15 + pmaddubsw m10, m9 + paddw m1, m10 + pshufb m10, m14, m15 + pmaddubsw m10, m9 + paddw m6, m10 + vpbroadcastd m9, [z_filter_k+4*2+12*2] + shufps m10, m8, m7, q2132 + pshufb m11, m10 + pmaddubsw m11, m9 + pshufb m12, m10 + pmaddubsw m12, m9 + pshufb m13, m10 + pmaddubsw m13, m9 + pshufb m14, m10 + pmaddubsw 
m14, m9 + paddw m0, m11 + paddw m2, m12 + paddw m1, m13 + paddw m6, m14 + movu xm11, [tlq-66] ; 56-63 + vinserti128 m11, [tlq-52], 1 ; 40-47 + movu xm12, [tlq-58] ; 48-55 + vinserti128 m12, [tlq-44], 1 ; 32-39 + movu xm13, [tlq-34] ; 24-31 + vinserti128 m13, [tlq-20], 1 ; 8-15 + movu xm14, [tlq-28] ; 16-23 + vinserti128 m14, [tlq-14], 1 ; 0- 7 + pmulhrsw m0, m3 + pmulhrsw m2, m3 + pmulhrsw m1, m3 + pmulhrsw m6, m3 + lea tlq, [rsp+127] + packuswb m0, m2 + packuswb m1, m6 + mova [tlq-127], m0 + mova [tlq- 95], m1 + pshufb m0, m11, m10 + pmaddubsw m0, m9 + pshufb m2, m12, m10 + pmaddubsw m2, m9 + pshufb m1, m13, m10 + pmaddubsw m1, m9 + pshufb m6, m14, m7 + pmaddubsw m6, m9 + vpbroadcastd m9, [z_filter_k+4*2+12*1] + pshufb m7, m11, m15 + pmaddubsw m7, m9 + paddw m0, m7 + pshufb m7, m12, m15 + pmaddubsw m7, m9 + paddw m2, m7 + pshufb m7, m13, m15 + pmaddubsw m7, m9 + paddw m1, m7 + pshufb m7, m14, m10 + pmaddubsw m7, m9 + paddw m6, m7 + vpbroadcastd m9, [z_filter_k+4*2+12*0] + pshufb m11, m8 + pmaddubsw m11, m9 + pshufb m12, m8 + pmaddubsw m12, m9 + pshufb m13, m8 + pmaddubsw m13, m9 + pshufb m14, m15 + pmaddubsw m14, m9 + paddw m0, m11 + paddw m2, m12 + paddw m1, m13 + paddw m6, m14 + pmulhrsw m0, m3 + pmulhrsw m2, m3 + pmulhrsw m1, m3 + pmulhrsw m6, m3 + packuswb m0, m2 + packuswb m1, m6 + mova [tlq-63], m0 + mova [tlq-31], m1 +.h64_main: + movd xm12, dyd + neg maxbaseq + vbroadcasti128 m8, [z3_shuf] + vpbroadcastb m7, [tlq+maxbaseq] + shl maxbased, 6 + vpbroadcastw m12, xm12 + lea r5d, [dyq+maxbaseq-64] + neg dyq + or maxbased, 63 + lea r4, [dyq+63] + movd xm6, r5d + mova xm10, [pb_1to32+16] + vinserti128 m10, [pb_1to32], 1 + vpbroadcastd m11, [pb_32] + vpbroadcastw m6, xm6 +.h64_loop: + mov r5, r4 + sar r5, 6 + movu m0, [tlq+r5-24] + movu m1, [tlq+r5-32] + pand m2, m4, m6 + psubw m9, m5, m2 + psllw m2, 8 + por m9, m2 + pshufb m0, m8 + pshufb m1, m8 + pmaddubsw m0, m9 + pmaddubsw m1, m9 + psraw m2, m6, 6 + sub rsp, 64 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packsswb m2, m2 + paddb m2, m10 + packuswb m0, m1 + vpblendvb m0, m7, m0, m2 + mova [rsp+32], m0 + movu m0, [tlq+r5-56] + movu m1, [tlq+r5-64] + add r4, dyq + pshufb m0, m8 + pshufb m1, m8 + pmaddubsw m0, m9 + pmaddubsw m1, m9 + paddb m2, m11 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + paddw m6, m12 + packuswb m0, m1 + vpblendvb m0, m7, m0, m2 + mova [rsp], m0 + dec wd + jz .h64_transpose + cmp r4d, maxbased + jg .h64_loop +.h64_end_loop: + sub rsp, 64 + mova [rsp+32], m7 + mova [rsp+ 0], m7 + dec wd + jg .h64_end_loop +.h64_transpose: + lea r2, [strideq*3] + lea r3, [strideq*5] + imul r5, strideq, -8 + lea dstq, [dstq+org_wq-16] + lea r4, [strideq+r2*2] ; stride*7 +.h64_transpose_loop0: + lea r6, [rsp+16*3] +.h64_transpose_loop: + mova xm0, [r6+64*15] + vinserti128 m0, [r6+64* 7], 1 + mova xm1, [r6+64*14] + vinserti128 m1, [r6+64* 6], 1 + mova xm2, [r6+64*13] + vinserti128 m2, [r6+64* 5], 1 + mova xm3, [r6+64*12] + vinserti128 m3, [r6+64* 4], 1 + mova xm4, [r6+64*11] + vinserti128 m4, [r6+64* 3], 1 + mova xm5, [r6+64*10] + vinserti128 m5, [r6+64* 2], 1 + mova xm6, [r6+64* 9] + vinserti128 m6, [r6+64* 1], 1 + mova xm7, [r6+64* 8] + vinserti128 m7, [r6+64* 0], 1 + sub r6, 16 + punpcklbw m8, m0, m1 + punpckhbw m0, m1 + punpcklbw m1, m2, m3 + punpckhbw m2, m3 + punpcklbw m3, m4, m5 + punpckhbw m4, m5 + punpcklbw m5, m6, m7 + punpckhbw m6, m7 + punpcklwd m7, m8, m1 + punpckhwd m8, m1 + punpcklwd m1, m0, m2 + punpckhwd m0, m2 + punpcklwd m2, m3, m5 + punpckhwd m3, m5 + punpcklwd m5, m4, m6 + punpckhwd m4, m6 + punpckldq m6, m7, m2 + punpckhdq m7, m2 
+ punpckldq m2, m8, m3 + punpckhdq m8, m3 + punpckldq m3, m1, m5 + punpckhdq m1, m5 + punpckldq m5, m0, m4 + punpckhdq m0, m4 + vpermq m6, m6, q3120 + vpermq m7, m7, q3120 + vpermq m2, m2, q3120 + vpermq m8, m8, q3120 + vpermq m3, m3, q3120 + vpermq m1, m1, q3120 + vpermq m5, m5, q3120 + vpermq m0, m0, q3120 + mova [dstq+strideq*0], xm6 + vextracti128 [dstq+strideq*1], m6, 1 + mova [dstq+strideq*2], xm7 + vextracti128 [dstq+r2 ], m7, 1 + mova [dstq+strideq*4], xm2 + vextracti128 [dstq+r3 ], m2, 1 + mova [dstq+r2*2 ], xm8 + vextracti128 [dstq+r4 ], m8, 1 + sub dstq, r5 + mova [dstq+strideq*0], xm3 + vextracti128 [dstq+strideq*1], m3, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+r2 ], m1, 1 + mova [dstq+strideq*4], xm5 + vextracti128 [dstq+r3 ], m5, 1 + mova [dstq+r2*2 ], xm0 + vextracti128 [dstq+r4 ], m0, 1 + sub dstq, r5 + cmp r6, rsp + jae .h64_transpose_loop + add rsp, 64*16 + lea dstq, [dstq+r5*8-16] + sub org_wd, 16 + jg .h64_transpose_loop0 +.h64_end: + RET + +%macro FILTER_XMM 4 ; dst, src, tmp, shuf +%ifnum %4 + pshufb xm%2, xm%4 +%else + pshufb xm%2, %4 +%endif + pshufd xm%1, xm%2, q0000 ; p0 p1 + pmaddubsw xm%1, xm2 + pshufd xm%3, xm%2, q1111 ; p2 p3 + pmaddubsw xm%3, xm3 + paddw xm%1, xm1 + paddw xm%1, xm%3 + pshufd xm%3, xm%2, q2222 ; p4 p5 + pmaddubsw xm%3, xm4 + paddw xm%1, xm%3 + pshufd xm%3, xm%2, q3333 ; p6 __ + pmaddubsw xm%3, xm5 + paddw xm%1, xm%3 + psraw xm%1, 4 + packuswb xm%1, xm%1 +%endmacro + +%macro FILTER_YMM 4 ; dst, src, tmp, shuf + pshufb m%2, m%4 + pshufd m%1, m%2, q0000 + pmaddubsw m%1, m2 + pshufd m%3, m%2, q1111 + pmaddubsw m%3, m3 + paddw m%1, m1 + paddw m%1, m%3 + pshufd m%3, m%2, q2222 + pmaddubsw m%3, m4 + paddw m%1, m%3 + pshufd m%3, m%2, q3333 + pmaddubsw m%3, m5 + paddw m%1, m%3 + psraw m%1, 4 + vperm2i128 m%3, m%1, m%1, 0x01 + packuswb m%1, m%3 +%endmacro + +; The ipred_filter SIMD processes 4x2 blocks in the following order which +; increases parallelism compared to doing things row by row. One redundant +; block is calculated for w8 and w16, two for w32. 
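+; [annotation, not in upstream dav1d] FILTER_XMM/FILTER_YMM evaluate
+; the AV1 filter-intra predictor for one 4x2 block: the 7 neighbouring
+; pixels p0..p6 (top-left, four above, two to the left; cf. the
+; "_ 6 5 0 1 2 3 4" load below) are weighted by one of the five
+; filter_intra_taps sets, pw_8 and psraw 4 apply the rounded
+; downshift, and packuswb clips. A rough scalar model:
+;   out[x] = clip((sum_{i=0..6} taps[mode][x][i]*p[i] + 8) >> 4)
+; Because each block consumes its top and left neighbours' outputs,
+; blocks are walked along anti-diagonals; the upstream diagram of that
+; wavefront order follows.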
+; w4 w8 w16 w32 +; 1 1 2 1 2 3 5 1 2 3 5 b c d f +; 2 2 3 2 4 5 7 2 4 5 7 c e f h +; 3 3 4 4 6 7 9 4 6 7 9 e g h j +; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___ +; 5 8 8 i + +cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter +%define base r6-ipred_filter_avx2_table + lea r6, [filter_intra_taps] + tzcnt wd, wm +%ifidn filterd, filterm + movzx filterd, filterb +%else + movzx filterd, byte filterm +%endif + shl filterd, 6 + add filterq, r6 + lea r6, [ipred_filter_avx2_table] + movq xm0, [tlq-3] ; _ 6 5 0 1 2 3 4 + movsxd wq, [r6+wq*4] + vpbroadcastd m1, [base+pw_8] + vbroadcasti128 m2, [filterq+16*0] + vbroadcasti128 m3, [filterq+16*1] + vbroadcasti128 m4, [filterq+16*2] + vbroadcasti128 m5, [filterq+16*3] + add wq, r6 + mov hd, hm + jmp wq +.w4: + WIN64_SPILL_XMM 9 + mova xm8, [base+filter_shuf2] + sub tlq, 3 + sub tlq, hq + jmp .w4_loop_start +.w4_loop: + pinsrd xm0, xm6, [tlq+hq], 0 + lea dstq, [dstq+strideq*2] +.w4_loop_start: + FILTER_XMM 6, 0, 7, 8 + movd [dstq+strideq*0], xm6 + pextrd [dstq+strideq*1], xm6, 1 + sub hd, 2 + jg .w4_loop + RET +ALIGN function_align +.w8: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 10 + mova m8, [base+filter_shuf1] + FILTER_XMM 7, 0, 6, [base+filter_shuf2] + vpbroadcastd m0, [tlq+4] + vpbroadcastd m6, [tlq+5] + sub tlq, 4 + sub tlq, hq + vpbroadcastq m7, xm7 + vpblendd m7, m6, 0x20 +.w8_loop: + vpbroadcastd xm6, [tlq+hq] + palignr m6, m0, 12 + vpblendd m0, m6, m7, 0xeb ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _ + ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + mova xm6, xm7 + call .main + vpblendd xm6, xm7, 0x0c + pshufd xm6, xm6, q3120 + movq [dstq+strideq*0], xm6 + movhps [dstq+strideq*1], xm6 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET +ALIGN function_align +.w16: +%if WIN64 + %assign stack_offset stack_offset - stack_size_padded + %assign xmm_regs_used 15 + %assign stack_size_padded 0x98 + SUB rsp, stack_size_padded +%endif + sub hd, 2 + TAIL_CALL .w16_main, 0 +.w16_main: +%if WIN64 + movaps [rsp+0xa8], xmm6 + movaps [rsp+0xb8], xmm7 + movaps [rsp+0x28], xmm8 + movaps [rsp+0x38], xmm9 + movaps [rsp+0x48], xmm10 + movaps [rsp+0x58], xmm11 + movaps [rsp+0x68], xmm12 + movaps [rsp+0x78], xmm13 + movaps [rsp+0x88], xmm14 +%endif + FILTER_XMM 12, 0, 7, [base+filter_shuf2] + vpbroadcastd m0, [tlq+5] + vpblendd m0, [tlq-12], 0x14 + mova m8, [base+filter_shuf1] + vpbroadcastq m7, xm12 + vpblendd m0, m7, 0xc2 ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _ + ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + call .main ; c0 d0 a1 b1 a1 b1 c0 d0 + movlps xm9, xm7, [tlq+5] ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + vinserti128 m14, m8, [base+filter_shuf3], 0 + vpblendd xm12, xm7, 0x0c ; a0 b0 a1 b1 + FILTER_XMM 6, 9, 10, 14 + vpbroadcastq m6, xm6 ; a2 b2 __ __ __ __ a2 b2 + vpbroadcastd m9, [tlq+13] + vpbroadcastd m10, [tlq+12] + psrld m11, m8, 4 + vpblendd m6, m9, 0x20 ; top + sub tlq, 6 + sub tlq, hq +.w16_loop: + vpbroadcastd xm9, [tlq+hq] + palignr m9, m0, 12 + vpblendd m0, m9, m7, 0xe2 ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _ + ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + mova xm13, xm7 + call .main ; e0 f0 c1 d1 c1 d1 e0 f0 + vpblendd m9, m12, m10, 0xf0 + vpblendd m12, m6, 0xc0 + pshufd m9, m9, q3333 + vpblendd m9, m6, 0xee + vpblendd m10, m9, m7, 0x0c ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + FILTER_YMM 6, 10, 9, 14 ; c2 d2 a3 b3 a3 b3 c2 d2 + vpblendd m12, m6, 0x30 ; a0 b0 a1 b1 a3 b3 a2 b2 + vpermd m9, m11, m12 ; a0 a1 a2 a3 b0 b1 b2 b3 + vpblendd xm12, xm13, xm7, 0x0c ; c0 d0 c1 d1 + mova [dstq+strideq*0], xm9 + vextracti128 
[dstq+strideq*1], m9, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_loop + vpblendd xm7, xm6, xm10, 0x04 ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4 + pshufd xm7, xm7, q1032 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + FILTER_XMM 0, 7, 9, [base+filter_shuf1+16] + vpblendd xm6, xm0, 0x0c ; c2 d2 c3 d3 + shufps xm0, xm12, xm6, q2020 ; c0 c1 c2 c3 + shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3 + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm6 + ret +ALIGN function_align +.w32: + sub rsp, stack_size_padded + sub hd, 2 + lea r3, [dstq+16] + lea r5d, [hq-2] + call .w16_main + add tlq, r5 + mov dstq, r3 + lea r3, [strideq-4] + lea r4, [r3+strideq*2] + movq xm0, [tlq+21] + pinsrd xm0, [dstq-4], 2 + pinsrd xm0, [dstq+r3*1], 3 + FILTER_XMM 12, 0, 7, 14 ; a0 b0 a0 b0 + movq xm7, [dstq+r3*2] + pinsrd xm7, [dstq+r4], 2 + palignr xm7, xm0, 12 ; 0 _ _ _ _ _ _ _ _ _ _ 5 _ _ _ 6 + vpbroadcastd m0, [tlq+28] + vpbroadcastd m9, [tlq+29] + vbroadcasti128 m8, [base+filter_shuf1+16] + vpblendd m0, m9, 0x20 + vpblendd m0, m7, 0x0f + vpbroadcastq m7, xm12 + vpblendd m0, m7, 0xc2 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + call .main ; c0 d0 a1 b1 a1 b1 c0 d0 + add r3, 2 + lea r4, [r4+strideq*2] + movlps xm9, xm7, [tlq+29] ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + vpblendd xm12, xm7, 0x0c ; a0 b0 a1 b1 + FILTER_XMM 6, 9, 10, 14 + vpbroadcastq m6, xm6 ; a2 b2 __ __ __ __ a2 b2 + vpbroadcastd m9, [tlq+37] + vpbroadcastd m10, [tlq+36] + vpblendd m6, m9, 0x20 ; top +.w32_loop: + movq xm9, [dstq+r3*4] + pinsrd xm9, [dstq+r4], 2 +.w32_loop_last: + palignr m9, m0, 12 + vpblendd m0, m9, m7, 0xe2 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + mova xm13, xm7 ; c0 d0 + call .main ; e0 f0 c1 d1 c1 d1 e0 f0 + vpblendd m9, m12, m10, 0xf0 + vpblendd m12, m6, 0xc0 + pshufd m9, m9, q3333 + vpblendd m9, m6, 0xee + vpblendd m10, m9, m7, 0x0c ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + FILTER_YMM 6, 10, 9, 14 ; c2 d2 a3 b3 a3 b3 c2 d2 + vpblendd m12, m6, 0x30 ; a0 b0 a1 b1 a3 b3 a2 b2 + vpermd m9, m11, m12 ; a0 a1 a2 a3 b0 b1 b2 b3 + vpblendd xm12, xm13, xm7, 0x0c ; c0 d0 c1 d1 + mova [dstq+strideq*0], xm9 + vextracti128 [dstq+strideq*1], m9, 1 + lea dstq, [dstq+strideq*2] + sub r5d, 2 + jg .w32_loop + jz .w32_loop_last + vpblendd xm7, xm6, xm10, 0x04 ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4 + pshufd xm7, xm7, q1032 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 + FILTER_XMM 0, 7, 9, [base+filter_shuf1+16] + vpblendd xm6, xm0, 0x0c ; c2 d2 c3 d3 + shufps xm0, xm12, xm6, q2020 ; c0 c1 c2 c3 + shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3 + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm6 + RET +ALIGN function_align +.main: + FILTER_YMM 7, 0, 9, 8 + ret + +%if WIN64 +DECLARE_REG_TMP 5 +%else +DECLARE_REG_TMP 7 +%endif + +%macro IPRED_CFL 1 ; ac in, unpacked pixels out + psignw m3, m%1, m1 + pabsw m%1, m%1 + pmulhrsw m%1, m2 + psignw m%1, m3 + paddw m%1, m0 +%endmacro + +cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha + lea t0, [ipred_cfl_left_avx2_table] + tzcnt wd, wm + inc tlq + movu m0, [tlq] + movifnidn hd, hm + mov r6d, 0x8000 + shrx r6d, r6d, wd + movd xm3, r6d + movsxd r6, [t0+wq*4] + pcmpeqd m2, m2 + pmaddubsw m0, m2 + add r6, t0 + add t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table + movsxd wq, [t0+wq*4] + add wq, t0 + movifnidn acq, acmp + jmp r6 + +cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha + mov hd, hm ; zero upper half + tzcnt r6d, hd + sub tlq, hq + tzcnt wd, wm + movu m0, [tlq] + mov t0d, 0x8000 + shrx t0d, t0d, r6d + movd xm3, t0d + lea t0, [ipred_cfl_left_avx2_table] + movsxd r6, 
[t0+r6*4] + pcmpeqd m2, m2 + pmaddubsw m0, m2 + add r6, t0 + add t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table + movsxd wq, [t0+wq*4] + add wq, t0 + movifnidn acq, acmp + jmp r6 +.h32: + vextracti128 xm1, m0, 1 + paddw xm0, xm1 +.h16: + punpckhqdq xm1, xm0, xm0 + paddw xm0, xm1 +.h8: + psrlq xm1, xm0, 32 + paddw xm0, xm1 +.h4: + pmaddwd xm0, xm2 + pmulhrsw xm0, xm3 + vpbroadcastw m0, xm0 + jmp wq + +cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha + movifnidn hd, hm + movifnidn wd, wm + tzcnt r6d, hd + lea t0d, [wq+hq] + movd xm4, t0d + tzcnt t0d, t0d + movd xm5, t0d + lea t0, [ipred_cfl_avx2_table] + tzcnt wd, wd + movsxd r6, [t0+r6*4] + movsxd wq, [t0+wq*4+4*4] + pcmpeqd m3, m3 + psrlw xm4, 1 + add r6, t0 + add wq, t0 + movifnidn acq, acmp + jmp r6 +.h4: + movd xm0, [tlq-4] + pmaddubsw xm0, xm3 + jmp wq +.w4: + movd xm1, [tlq+1] + pmaddubsw xm1, xm3 + psubw xm0, xm4 + paddw xm0, xm1 + pmaddwd xm0, xm3 + cmp hd, 4 + jg .w4_mul + psrlw xm0, 3 + jmp .w4_end +.w4_mul: + punpckhqdq xm1, xm0, xm0 + lea r2d, [hq*2] + mov r6d, 0x55563334 + paddw xm0, xm1 + shrx r6d, r6d, r2d + psrlq xm1, xm0, 32 + paddw xm0, xm1 + movd xm1, r6d + psrlw xm0, 2 + pmulhuw xm0, xm1 +.w4_end: + vpbroadcastw m0, xm0 +.s4: + vpbroadcastw m1, alpham + lea r6, [strideq*3] + pabsw m2, m1 + psllw m2, 9 +.s4_loop: + mova m4, [acq] + IPRED_CFL 4 + packuswb m4, m4 + vextracti128 xm5, m4, 1 + movd [dstq+strideq*0], xm4 + pextrd [dstq+strideq*1], xm4, 1 + movd [dstq+strideq*2], xm5 + pextrd [dstq+r6 ], xm5, 1 + lea dstq, [dstq+strideq*4] + add acq, 32 + sub hd, 4 + jg .s4_loop + RET +ALIGN function_align +.h8: + movq xm0, [tlq-8] + pmaddubsw xm0, xm3 + jmp wq +.w8: + movq xm1, [tlq+1] + vextracti128 xm2, m0, 1 + pmaddubsw xm1, xm3 + psubw xm0, xm4 + paddw xm0, xm2 + punpckhqdq xm2, xm0, xm0 + paddw xm0, xm2 + paddw xm0, xm1 + psrlq xm1, xm0, 32 + paddw xm0, xm1 + pmaddwd xm0, xm3 + psrlw xm0, xm5 + cmp hd, 8 + je .w8_end + mov r6d, 0x5556 + mov r2d, 0x3334 + cmp hd, 32 + cmove r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 +.w8_end: + vpbroadcastw m0, xm0 +.s8: + vpbroadcastw m1, alpham + lea r6, [strideq*3] + pabsw m2, m1 + psllw m2, 9 +.s8_loop: + mova m4, [acq] + mova m5, [acq+32] + IPRED_CFL 4 + IPRED_CFL 5 + packuswb m4, m5 + vextracti128 xm5, m4, 1 + movq [dstq+strideq*0], xm4 + movq [dstq+strideq*1], xm5 + movhps [dstq+strideq*2], xm4 + movhps [dstq+r6 ], xm5 + lea dstq, [dstq+strideq*4] + add acq, 64 + sub hd, 4 + jg .s8_loop + RET +ALIGN function_align +.h16: + mova xm0, [tlq-16] + pmaddubsw xm0, xm3 + jmp wq +.w16: + movu xm1, [tlq+1] + vextracti128 xm2, m0, 1 + pmaddubsw xm1, xm3 + psubw xm0, xm4 + paddw xm0, xm2 + paddw xm0, xm1 + punpckhqdq xm1, xm0, xm0 + paddw xm0, xm1 + psrlq xm1, xm0, 32 + paddw xm0, xm1 + pmaddwd xm0, xm3 + psrlw xm0, xm5 + cmp hd, 16 + je .w16_end + mov r6d, 0x5556 + mov r2d, 0x3334 + test hb, 8|32 + cmovz r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 +.w16_end: + vpbroadcastw m0, xm0 +.s16: + vpbroadcastw m1, alpham + pabsw m2, m1 + psllw m2, 9 +.s16_loop: + mova m4, [acq] + mova m5, [acq+32] + IPRED_CFL 4 + IPRED_CFL 5 + packuswb m4, m5 + vpermq m4, m4, q3120 + mova [dstq+strideq*0], xm4 + vextracti128 [dstq+strideq*1], m4, 1 + lea dstq, [dstq+strideq*2] + add acq, 64 + sub hd, 2 + jg .s16_loop + RET +ALIGN function_align +.h32: + mova m0, [tlq-32] + pmaddubsw m0, m3 + jmp wq +.w32: + movu m1, [tlq+1] + pmaddubsw m1, m3 + paddw m0, m1 + vextracti128 xm1, m0, 1 + psubw xm0, xm4 + paddw xm0, xm1 + punpckhqdq xm1, xm0, xm0 + paddw xm0, xm1 + psrlq xm1, xm0, 32 + paddw 
xm0, xm1 + pmaddwd xm0, xm3 + psrlw xm0, xm5 + cmp hd, 32 + je .w32_end + lea r2d, [hq*2] + mov r6d, 0x33345556 + shrx r6d, r6d, r2d + movd xm1, r6d + pmulhuw xm0, xm1 +.w32_end: + vpbroadcastw m0, xm0 +.s32: + vpbroadcastw m1, alpham + pabsw m2, m1 + psllw m2, 9 +.s32_loop: + mova m4, [acq] + mova m5, [acq+32] + IPRED_CFL 4 + IPRED_CFL 5 + packuswb m4, m5 + vpermq m4, m4, q3120 + mova [dstq], m4 + add dstq, strideq + add acq, 64 + dec hd + jg .s32_loop + RET + +cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha + lea t0, [ipred_cfl_splat_avx2_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [t0+wq*4] + vpbroadcastd m0, [t0-ipred_cfl_splat_avx2_table+pw_128] + add wq, t0 + movifnidn acq, acmp + jmp wq + +cglobal ipred_cfl_ac_420_8bpc, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak + movifnidn hpadd, hpadm + movifnidn wd, wm + mov hd, hm + mov szd, wd + mov ac_bakq, acq + imul szd, hd + shl hpadd, 2 + sub hd, hpadd + vpbroadcastd m2, [pb_2] + pxor m4, m4 + cmp wd, 8 + jg .w16 + je .w8 + ; fall-through + + DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak +.w4: + lea stride3q, [strideq*3] +.w4_loop: + movq xm0, [yq] + movq xm1, [yq+strideq] + movhps xm0, [yq+strideq*2] + movhps xm1, [yq+stride3q] + pmaddubsw xm0, xm2 + pmaddubsw xm1, xm2 + paddw xm0, xm1 + mova [acq], xm0 + paddw xm4, xm0 + lea yq, [yq+strideq*4] + add acq, 16 + sub hd, 2 + jg .w4_loop + test hpadd, hpadd + jz .calc_avg + vpermq m0, m0, q1111 +.w4_hpad_loop: + mova [acq], m0 + paddw m4, m0 + add acq, 32 + sub hpadd, 4 + jg .w4_hpad_loop + jmp .calc_avg + +.w8: + lea stride3q, [strideq*3] + test wpadd, wpadd + jnz .w8_wpad +.w8_loop: + mova xm0, [yq] + mova xm1, [yq+strideq] + vinserti128 m0, [yq+strideq*2], 1 + vinserti128 m1, [yq+stride3q], 1 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + mova [acq], m0 + paddw m4, m0 + lea yq, [yq+strideq*4] + add acq, 32 + sub hd, 2 + jg .w8_loop + test hpadd, hpadd + jz .calc_avg + jmp .w8_hpad +.w8_wpad: + vbroadcasti128 m3, [cfl_ac_w8_pad1_shuffle] +.w8_wpad_loop: + movq xm0, [yq] + movq xm1, [yq+strideq] + vinserti128 m0, [yq+strideq*2], 1 + vinserti128 m1, [yq+stride3q], 1 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + pshufb m0, m3 + mova [acq], m0 + paddw m4, m0 + lea yq, [yq+strideq*4] + add acq, 32 + sub hd, 2 + jg .w8_wpad_loop + test hpadd, hpadd + jz .calc_avg +.w8_hpad: + vpermq m0, m0, q3232 +.w8_hpad_loop: + mova [acq], m0 + paddw m4, m0 + add acq, 32 + sub hpadd, 2 + jg .w8_hpad_loop + jmp .calc_avg + +.w16: + test wpadd, wpadd + jnz .w16_wpad +.w16_loop: + mova m0, [yq] + mova m1, [yq+strideq] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + mova [acq], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 32 + dec hd + jg .w16_loop + test hpadd, hpadd + jz .calc_avg + jmp .w16_hpad_loop +.w16_wpad: + DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak + lea iptrq, [ipred_cfl_ac_420_avx2_table] + shl wpadd, 2 + mova m3, [iptrq+cfl_ac_w16_pad_shuffle- \ + ipred_cfl_ac_420_avx2_table+wpadq*8-32] + movsxd wpadq, [iptrq+wpadq+4] + add iptrq, wpadq + jmp iptrq +.w16_pad3: + vpbroadcastq m0, [yq] + vpbroadcastq m1, [yq+strideq] + jmp .w16_wpad_end +.w16_pad2: + vbroadcasti128 m0, [yq] + vbroadcasti128 m1, [yq+strideq] + jmp .w16_wpad_end +.w16_pad1: + mova m0, [yq] + mova m1, [yq+strideq] + ; fall-through +.w16_wpad_end: + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + pshufb m0, m3 + mova [acq], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 32 + dec hd + jz .w16_wpad_done + jmp iptrq 
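+ ; iptrq holds the pad-specific re-entry point (.w16_pad1/2/3), so each row
+ ; repeats the matching partial load, and the pshufb with m3 replicates the
+ ; last valid column into the padded region. Per output element this is,
+ ; roughly, in scalar form (illustrative sketch of the 4:2:0 path only):
+ ;   ac[x] = 2 * (y[2*x] + y[2*x+1] + y[2*x+stride] + y[2*x+1+stride]);
+ ; i.e. 8x the average luma of each 2x2 block, accumulated into m4 for the
+ ; mean subtraction performed later in .calc_avg.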
+.w16_wpad_done: + test hpadd, hpadd + jz .calc_avg +.w16_hpad_loop: + mova [acq], m0 + paddw m4, m0 + add acq, 32 + dec hpadd + jg .w16_hpad_loop + ; fall-through + +.calc_avg: + vpbroadcastd m2, [pw_1] + pmaddwd m0, m4, m2 + vextracti128 xm1, m0, 1 + tzcnt r1d, szd + paddd xm0, xm1 + movd xm2, r1d + movd xm3, szd + punpckhqdq xm1, xm0, xm0 + paddd xm0, xm1 + psrad xm3, 1 + psrlq xm1, xm0, 32 + paddd xm0, xm3 + paddd xm0, xm1 + psrad xm0, xm2 + vpbroadcastw m0, xm0 +.sub_loop: + mova m1, [ac_bakq] + psubw m1, m0 + mova [ac_bakq], m1 + add ac_bakq, 32 + sub szd, 16 + jg .sub_loop + RET + +cglobal ipred_cfl_ac_422_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak + movifnidn hpadd, hpadm + movifnidn wd, wm + mov hd, hm + mov szd, wd + mov ac_bakq, acq + imul szd, hd + shl hpadd, 2 + sub hd, hpadd + vpbroadcastd m2, [pb_4] + pxor m4, m4 + pxor m5, m5 + cmp wd, 8 + jg .w16 + je .w8 + ; fall-through + + DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak +.w4: + lea stride3q, [strideq*3] +.w4_loop: + movq xm1, [yq] + movhps xm1, [yq+strideq] + movq xm0, [yq+strideq*2] + movhps xm0, [yq+stride3q] + pmaddubsw xm0, xm2 + pmaddubsw xm1, xm2 + mova [acq], xm1 + mova [acq+16], xm0 + paddw xm4, xm0 + paddw xm5, xm1 + lea yq, [yq+strideq*4] + add acq, 32 + sub hd, 4 + jg .w4_loop + test hpadd, hpadd + jz .calc_avg + vpermq m0, m0, q1111 +.w4_hpad_loop: + mova [acq], m0 + paddw m4, m0 + add acq, 32 + sub hpadd, 4 + jg .w4_hpad_loop + jmp .calc_avg + +.w8: + lea stride3q, [strideq*3] + test wpadd, wpadd + jnz .w8_wpad +.w8_loop: + mova xm1, [yq] + vinserti128 m1, [yq+strideq], 1 + mova xm0, [yq+strideq*2] + vinserti128 m0, [yq+stride3q], 1 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + mova [acq], m1 + mova [acq+32], m0 + paddw m4, m0 + paddw m5, m1 + lea yq, [yq+strideq*4] + add acq, 64 + sub hd, 4 + jg .w8_loop + test hpadd, hpadd + jz .calc_avg + jmp .w8_hpad +.w8_wpad: + vbroadcasti128 m3, [cfl_ac_w8_pad1_shuffle] +.w8_wpad_loop: + movq xm1, [yq] + vinserti128 m1, [yq+strideq], 1 + movq xm0, [yq+strideq*2] + vinserti128 m0, [yq+stride3q], 1 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pshufb m0, m3 + pshufb m1, m3 + mova [acq], m1 + mova [acq+32], m0 + paddw m4, m0 + paddw m5, m1 + lea yq, [yq+strideq*4] + add acq, 64 + sub hd, 4 + jg .w8_wpad_loop + test hpadd, hpadd + jz .calc_avg +.w8_hpad: + vpermq m0, m0, q3232 +.w8_hpad_loop: + mova [acq], m0 + paddw m4, m0 + add acq, 32 + sub hpadd, 2 + jg .w8_hpad_loop + jmp .calc_avg + +.w16: + test wpadd, wpadd + jnz .w16_wpad +.w16_loop: + mova m1, [yq] + mova m0, [yq+strideq] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + mova [acq], m1 + mova [acq+32], m0 + paddw m4, m0 + paddw m5, m1 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_loop + test hpadd, hpadd + jz .calc_avg + jmp .w16_hpad_loop +.w16_wpad: + DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak + lea iptrq, [ipred_cfl_ac_422_avx2_table] + shl wpadd, 2 + mova m3, [iptrq+cfl_ac_w16_pad_shuffle- \ + ipred_cfl_ac_422_avx2_table+wpadq*8-32] + movsxd wpadq, [iptrq+wpadq+4] + add iptrq, wpadq + jmp iptrq +.w16_pad3: + vpbroadcastq m1, [yq] + vpbroadcastq m0, [yq+strideq] + jmp .w16_wpad_end +.w16_pad2: + vbroadcasti128 m1, [yq] + vbroadcasti128 m0, [yq+strideq] + jmp .w16_wpad_end +.w16_pad1: + mova m1, [yq] + mova m0, [yq+strideq] + ; fall-through +.w16_wpad_end: + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pshufb m0, m3 + pshufb m1, m3 + mova [acq], m1 + mova [acq+32], m0 + paddw m4, m0 + paddw m5, m1 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jz .w16_wpad_done + jmp 
iptrq +.w16_wpad_done: + test hpadd, hpadd + jz .calc_avg +.w16_hpad_loop: + mova [acq], m0 + mova [acq+32], m0 + paddw m4, m0 + paddw m5, m0 + add acq, 64 + sub hpadd, 2 + jg .w16_hpad_loop + ; fall-through + +.calc_avg: + vpbroadcastd m2, [pw_1] + pmaddwd m5, m5, m2 + pmaddwd m0, m4, m2 + paddd m0, m5 + vextracti128 xm1, m0, 1 + tzcnt r1d, szd + paddd xm0, xm1 + movd xm2, r1d + movd xm3, szd + punpckhqdq xm1, xm0, xm0 + paddd xm0, xm1 + psrad xm3, 1 + psrlq xm1, xm0, 32 + paddd xm0, xm3 + paddd xm0, xm1 + psrad xm0, xm2 + vpbroadcastw m0, xm0 +.sub_loop: + mova m1, [ac_bakq] + psubw m1, m0 + mova [ac_bakq], m1 + add ac_bakq, 32 + sub szd, 16 + jg .sub_loop + RET + +cglobal ipred_cfl_ac_444_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak + movifnidn hpadd, hpadm + movifnidn wd, wm + mov hd, hm + mov szd, wd + imul szd, hd + shl hpadd, 2 + sub hd, hpadd + pxor m4, m4 + vpbroadcastd m5, [pw_1] + tzcnt r8d, wd + lea r5, [ipred_cfl_ac_444_avx2_table] + movsxd r8, [r5+r8*4+12] + add r5, r8 + + DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak + mov ac_bakq, acq + jmp r5 + +.w4: + lea stride3q, [strideq*3] + pxor xm2, xm2 +.w4_loop: + movd xm1, [yq] + movd xm0, [yq+strideq*2] + pinsrd xm1, [yq+strideq], 1 + pinsrd xm0, [yq+stride3q], 1 + punpcklbw xm1, xm2 + punpcklbw xm0, xm2 + psllw xm1, 3 + psllw xm0, 3 + mova [acq], xm1 + mova [acq+16], xm0 + paddw xm1, xm0 + paddw xm4, xm1 + lea yq, [yq+strideq*4] + add acq, 32 + sub hd, 4 + jg .w4_loop + test hpadd, hpadd + jz .calc_avg_mul + pshufd xm0, xm0, q3232 + paddw xm1, xm0, xm0 +.w4_hpad_loop: + mova [acq], xm0 + mova [acq+16], xm0 + paddw xm4, xm1 + add acq, 32 + sub hpadd, 4 + jg .w4_hpad_loop + jmp .calc_avg_mul + +.w8: + lea stride3q, [strideq*3] + pxor m2, m2 +.w8_loop: + movq xm1, [yq] + movq xm0, [yq+strideq*2] + vinserti128 m1, [yq+strideq], 1 + vinserti128 m0, [yq+stride3q], 1 + punpcklbw m1, m2 + punpcklbw m0, m2 + psllw m1, 3 + psllw m0, 3 + mova [acq], m1 + mova [acq+32], m0 + paddw m1, m0 + paddw m4, m1 + lea yq, [yq+strideq*4] + add acq, 64 + sub hd, 4 + jg .w8_loop + test hpadd, hpadd + jz .calc_avg_mul + vpermq m0, m0, q3232 + paddw m1, m0, m0 +.w8_hpad_loop: + mova [acq], m0 + mova [acq+32], m0 + paddw m4, m1 + add acq, 64 + sub hpadd, 4 + jg .w8_hpad_loop + jmp .calc_avg_mul + +.w16: + test wpadd, wpadd + jnz .w16_wpad +.w16_loop: + pmovzxbw m1, [yq] + pmovzxbw m0, [yq+strideq] + psllw m1, 3 + psllw m0, 3 + mova [acq], m1 + mova [acq+32], m0 + paddw m1, m0 + pmaddwd m1, m5 + paddd m4, m1 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_loop + test hpadd, hpadd + jz .calc_avg + jmp .w16_hpad +.w16_wpad: + mova m3, [cfl_ac_444_w16_pad1_shuffle] +.w16_wpad_loop: + vpbroadcastq m1, [yq] + vpbroadcastq m0, [yq+strideq] + pshufb m1, m3 + pshufb m0, m3 + psllw m1, 3 + psllw m0, 3 + mova [acq], m1 + mova [acq+32], m0 + paddw m1, m0 + pmaddwd m1, m5 + paddd m4, m1 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_wpad_loop + test hpadd, hpadd + jz .calc_avg +.w16_hpad: + paddw m1, m0, m0 + pmaddwd m1, m5 +.w16_hpad_loop: + mova [acq], m0 + mova [acq+32], m0 + paddd m4, m1 + add acq, 64 + sub hpadd, 2 + jg .w16_hpad_loop + jmp .calc_avg + +.w32: + test wpadd, wpadd + jnz .w32_wpad +.w32_loop: + pmovzxbw m1, [yq] + pmovzxbw m0, [yq+16] + psllw m1, 3 + psllw m0, 3 + mova [acq], m1 + mova [acq+32], m0 + paddw m2, m1, m0 + pmaddwd m2, m5 + paddd m4, m2 + add yq, strideq + add acq, 64 + dec hd + jg .w32_loop + test hpadd, hpadd + jz .calc_avg + jmp .w32_hpad_loop +.w32_wpad: + DEFINE_ARGS ac, y, 
stride, wpad, hpad, iptr, h, sz, ac_bak + lea iptrq, [ipred_cfl_ac_444_avx2_table] + add wpadd, wpadd + mova m3, [iptrq+cfl_ac_444_w16_pad1_shuffle-ipred_cfl_ac_444_avx2_table] + movsxd wpadq, [iptrq+wpadq+4] + add iptrq, wpadq + jmp iptrq +.w32_pad3: + vpbroadcastq m1, [yq] + pshufb m1, m3 + vpermq m0, m1, q3232 + jmp .w32_wpad_end +.w32_pad2: + pmovzxbw m1, [yq] + pshufhw m0, m1, q3333 + vpermq m0, m0, q3333 + jmp .w32_wpad_end +.w32_pad1: + pmovzxbw m1, [yq] + vpbroadcastq m0, [yq+16] + pshufb m0, m3 + ; fall-through +.w32_wpad_end: + psllw m1, 3 + psllw m0, 3 + mova [acq], m1 + mova [acq+32], m0 + paddw m2, m1, m0 + pmaddwd m2, m5 + paddd m4, m2 + add yq, strideq + add acq, 64 + dec hd + jz .w32_wpad_done + jmp iptrq +.w32_wpad_done: + test hpadd, hpadd + jz .calc_avg +.w32_hpad_loop: + mova [acq], m1 + mova [acq+32], m0 + paddd m4, m2 + add acq, 64 + dec hpadd + jg .w32_hpad_loop + jmp .calc_avg + +.calc_avg_mul: + pmaddwd m4, m5 +.calc_avg: + vextracti128 xm1, m4, 1 + tzcnt r1d, szd + paddd xm0, xm4, xm1 + movd xm2, r1d + movd xm3, szd + punpckhqdq xm1, xm0, xm0 + paddd xm0, xm1 + psrad xm3, 1 + psrlq xm1, xm0, 32 + paddd xm0, xm3 + paddd xm0, xm1 + psrad xm0, xm2 + vpbroadcastw m0, xm0 +.sub_loop: + mova m1, [ac_bakq] + psubw m1, m0 + mova [ac_bakq], m1 + add ac_bakq, 32 + sub szd, 16 + jg .sub_loop + RET + +cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h + vbroadcasti128 m4, [palq] + lea r2, [pal_pred_avx2_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r2+wq*4] + packuswb m4, m4 + add wq, r2 + lea r2, [strideq*3] + jmp wq +.w4: + pshufb xm0, xm4, [idxq] + add idxq, 16 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+r2 ], xm0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4 + RET +ALIGN function_align +.w8: + pshufb xm0, xm4, [idxq+16*0] + pshufb xm1, xm4, [idxq+16*1] + add idxq, 16*2 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r2 ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8 + RET +ALIGN function_align +.w16: + pshufb m0, m4, [idxq+32*0] + pshufb m1, m4, [idxq+32*1] + add idxq, 32*2 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+r2 ], m1, 1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16 + RET +ALIGN function_align +.w32: + pshufb m0, m4, [idxq+32*0] + pshufb m1, m4, [idxq+32*1] + pshufb m2, m4, [idxq+32*2] + pshufb m3, m4, [idxq+32*3] + add idxq, 32*4 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r2 ], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w32 + RET +ALIGN function_align +.w64: + pshufb m0, m4, [idxq+32*0] + pshufb m1, m4, [idxq+32*1] + pshufb m2, m4, [idxq+32*2] + pshufb m3, m4, [idxq+32*3] + add idxq, 32*4 + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*1+32*0], m2 + mova [dstq+strideq*1+32*1], m3 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w64 + RET + +%endif diff --git a/third_party/dav1d/src/x86/ipred_avx512.asm b/third_party/dav1d/src/x86/ipred_avx512.asm new file mode 100644 index 0000000000..38c86b54f5 --- /dev/null +++ b/third_party/dav1d/src/x86/ipred_avx512.asm @@ -0,0 +1,1432 @@ +; Copyright © 2020, VideoLAN and dav1d authors +; Copyright © 2020, Two Orioles, LLC +; All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +%macro SMOOTH_WEIGHT_TABLE 1-* + %rep %0 + db %1-128, 127-%1 + %rotate 1 + %endrep +%endmacro + +smooth_weights: SMOOTH_WEIGHT_TABLE \ + 0, 0, 255, 128, 255, 149, 85, 64, \ + 255, 197, 146, 105, 73, 50, 37, 32, \ + 255, 225, 196, 170, 145, 123, 102, 84, \ + 68, 54, 43, 33, 26, 20, 17, 16, \ + 255, 240, 225, 210, 196, 182, 169, 157, \ + 145, 133, 122, 111, 101, 92, 83, 74, \ + 66, 59, 52, 45, 39, 34, 29, 25, \ + 21, 17, 14, 12, 10, 9, 8, 8, \ + 255, 248, 240, 233, 225, 218, 210, 203, \ + 196, 189, 182, 176, 169, 163, 156, 150, \ + 144, 138, 133, 127, 121, 116, 111, 106, \ + 101, 96, 91, 86, 82, 77, 73, 69, \ + 65, 61, 57, 54, 50, 47, 44, 41, \ + 38, 35, 32, 29, 27, 25, 22, 20, \ + 18, 16, 15, 13, 12, 10, 9, 8, \ + 7, 6, 6, 5, 5, 4, 4, 4 + +; dav1d_filter_intra_taps[], reordered for VNNI: p1 p2 p3 p4, p6 p5 p0 __ +filter_taps: db 10, 0, 0, 0, 2, 10, 0, 0, 1, 1, 10, 0, 1, 1, 2, 10 + db 6, 0, 0, 0, 2, 6, 0, 0, 2, 2, 6, 0, 1, 2, 2, 6 + db 0, 12, -6, 0, 0, 9, -5, 0, 0, 7, -3, 0, 0, 5, -3, 0 + db 12, 2, -4, 0, 9, 2, -3, 0, 7, 2, -3, 0, 5, 3, -3, 0 + db 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16 + db 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16 + db 0, 10,-10, 0, 0, 6, -6, 0, 0, 4, -4, 0, 0, 2, -2, 0 + db 10, 0,-10, 0, 6, 0, -6, 0, 4, 0, -4, 0, 2, 0, -2, 0 + db 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8 + db 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4 + db 0, 16, -8, 0, 0, 16, -8, 0, 0, 16, -8, 0, 0, 16, -8, 0 + db 16, 0, -4, 0, 16, 0, -4, 0, 16, 0, -4, 0, 16, 0, -4, 0 + db 8, 0, 0, 0, 3, 8, 0, 0, 2, 3, 8, 0, 1, 2, 3, 8 + db 4, 0, 0, 0, 3, 4, 0, 0, 2, 3, 4, 0, 2, 2, 3, 4 + db 0, 10, -2, 0, 0, 6, -1, 0, 0, 4, -1, 0, 0, 2, 0, 0 + db 10, 3, -1, 0, 6, 4, -1, 0, 4, 4, -1, 0, 3, 3, -1, 0 + db 14, 0, 0, 0, 0, 14, 0, 0, 0, 0, 14, 0, 0, 0, 0, 14 + db 12, 0, 0, 0, 1, 12, 0, 0, 0, 0, 12, 0, 0, 0, 1, 12 + db 0, 14,-12, 0, 0, 12,-10, 0, 0, 11, -9, 0, 0, 10, -8, 0 + db 14, 0,-10, 0, 12, 0, -9, 0, 11, 1, -8, 0, 9, 1, -7, 0 +filter_perm: db 0, 1, 2, 3, 24, 25, 26, 27, 4, 5, 6, 7, 28, 29, 30, 31 + db 15, 11, 7, 3, 15, 11, 7, 3, 15, 11, 7, 3, 15, 11, 7,131 + db 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23,147 + db 
47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39,163 +filter_end: dd 2, 3, 16, 17, -1, -1, 20, 21, 0, 6, 24, 30, 1, 7, 25, 31 +smooth_shuf: db 7, 7, 7, 7, 0, 1, 0, 1, 3, 3, 3, 3, 8, 9, 8, 9 + db 5, 5, 5, 5, 4, 5, 4, 5, 1, 1, 1, 1, 12, 13, 12, 13 + db 6, 6, 6, 6, 2, 3, 2, 3, 2, 2, 2, 2, 10, 11, 10, 11 + db 4, 4, 4, 4, 6, 7, 6, 7, 0, 0, 0, 0, 14, 15, 14, 15 +smooth_endA: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63 + db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95 + db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127 +smooth_endB: db 1, 3, 5, 7, 9, 11, 13, 15, 65, 67, 69, 71, 73, 75, 77, 79 + db 17, 19, 21, 23, 25, 27, 29, 31, 81, 83, 85, 87, 89, 91, 93, 95 + db 33, 35, 37, 39, 41, 43, 45, 47, 97, 99,101,103,105,107,109,111 + db 49, 51, 53, 55, 57, 59, 61, 63,113,115,117,119,121,123,125,127 +ipred_h_shuf: db 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4 + db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 + +pb_127_m127: times 2 db 127, -127 +pb_128: times 4 db 128 +pw_128: times 2 dw 128 +pw_255: times 2 dw 255 + +%define pb_1 (ipred_h_shuf+24) +%define pb_2 (ipred_h_shuf+20) +%define pb_3 (ipred_h_shuf+16) +%define pd_8 (filter_taps+128) + +%macro JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - 2*4) + %xdefine %%base mangle(private_prefix %+ _%1_%2) + %%table: + %rep %0 - 2 + dd %%base %+ .%3 - (%%table - 2*4) + %rotate 1 + %endrep +%endmacro + +%define ipred_dc_splat_8bpc_avx512icl_table (ipred_dc_8bpc_avx512icl_table + 10*4) + +JMP_TABLE ipred_h_8bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_paeth_8bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_8bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_v_8bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_h_8bpc, avx512icl, w4, w8, w16, w32, w64 +JMP_TABLE ipred_dc_8bpc, avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ + s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 +JMP_TABLE ipred_dc_left_8bpc, avx512icl, h4, h8, h16, h32, h64 +JMP_TABLE pal_pred_8bpc, avx512icl, w4, w8, w16, w32, w64 + +SECTION .text + +INIT_ZMM avx512icl +cglobal ipred_dc_top_8bpc, 3, 7, 5, dst, stride, tl, w, h + lea r5, [ipred_dc_left_8bpc_avx512icl_table] + movd xm0, wm + tzcnt wd, wm + inc tlq + movifnidn hd, hm + movu ym1, [tlq] + movd xmm3, wd + movsxd r6, [r5+wq*4] + vpbroadcastd ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1] + psrld xm0, 1 + vpdpbusd ym0, ym1, ym2 + add r6, r5 + add r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 + +cglobal ipred_dc_left_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 + lea r5, [ipred_dc_left_8bpc_avx512icl_table] + mov hd, hm + tzcnt r6d, hd + sub tlq, hq + tzcnt wd, wm + movd xm0, hm + movu ym1, [tlq] + movd xmm3, r6d + movsxd r6, [r5+r6*4] + vpbroadcastd ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1] + psrld xm0, 1 + vpdpbusd ym0, ym1, ym2 + add r6, r5 + add r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 +.h64: + movu ym1, [tlq+32] ; unaligned when jumping here from dc_top + vpdpbusd ym0, ym1, ym2 +.h32: + vextracti32x4 xm1, ym0, 1 + paddd xm0, xm1 +.h16: + punpckhqdq xm1, xm0, xm0 + paddd xm0, xm1 +.h8: + psrlq xm1, xm0, 32 + paddd xm0, xm1 +.h4: + vpsrlvd xm0, xmm3 + lea stride3q, [strideq*3] + vpbroadcastb m0, xm0 + jmp wq + +cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 + movifnidn 
hd, hm + movifnidn wd, wm + tzcnt r6d, hd + lea r5d, [wq+hq] + movd xm0, r5d + tzcnt r5d, r5d + movd xmm4, r5d + lea r5, [ipred_dc_8bpc_avx512icl_table] + tzcnt wd, wd + movsxd r6, [r5+r6*4] + movsxd wq, [r5+wq*4+5*4] + vpbroadcastd ym3, [r5-ipred_dc_8bpc_avx512icl_table+pb_1] + psrld xm0, 1 + add r6, r5 + add wq, r5 + lea stride3q, [strideq*3] + jmp r6 +.h4: + movd xmm1, [tlq-4] + vpdpbusd xm0, xmm1, xm3 + jmp wq +.w4: + movd xmm1, [tlq+1] + vpdpbusd xm0, xmm1, xm3 + cmp hd, 4 + jg .w4_mul + psrlw xmm0, xm0, 3 + jmp .w4_end +.w4_mul: + punpckhqdq xmm1, xm0, xm0 + lea r2d, [hq*2] + mov r6d, 0x55563334 + paddd xmm1, xm0 + shrx r6d, r6d, r2d + psrlq xmm0, xmm1, 32 + paddd xmm0, xmm1 + movd xmm1, r6d + psrld xmm0, 2 + pmulhuw xmm0, xmm1 +.w4_end: + vpbroadcastb xm0, xmm0 +.s4: + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xm0 + movd [dstq+strideq*2], xm0 + movd [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s4 + RET +.h8: + movq xmm1, [tlq-8] + vpdpbusd xm0, xmm1, xm3 + jmp wq +.w8: + movq xmm1, [tlq+1] + vextracti32x4 xm2, ym0, 1 + vpdpbusd xm0, xmm1, xm3 + paddd xmm2, xm2, xm0 + punpckhqdq xmm0, xmm2, xmm2 + paddd xmm0, xmm2 + psrlq xmm1, xmm0, 32 + paddd xmm0, xmm1 + vpsrlvd xmm0, xmm4 + cmp hd, 8 + je .w8_end + mov r6d, 0x5556 + mov r2d, 0x3334 + cmp hd, 32 + cmove r6d, r2d + movd xmm1, r6d + pmulhuw xmm0, xmm1 +.w8_end: + vpbroadcastb xm0, xmm0 +.s8: + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm0 + movq [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s8 + RET +.h16: + mova xmm1, [tlq-16] + vpdpbusd xm0, xmm1, xm3 + jmp wq +.w16: + movu xmm1, [tlq+1] + vextracti32x4 xm2, ym0, 1 + vpdpbusd xm0, xmm1, xm3 + paddd xmm2, xm2, xm0 + punpckhqdq xmm0, xmm2, xmm2 + paddd xmm0, xmm2 + psrlq xmm1, xmm0, 32 + paddd xmm0, xmm1 + vpsrlvd xmm0, xmm4 + cmp hd, 16 + je .w16_end + mov r6d, 0x5556 + mov r2d, 0x3334 + test hb, 8|32 + cmovz r6d, r2d + movd xmm1, r6d + pmulhuw xmm0, xmm1 +.w16_end: + vpbroadcastb xm0, xmm0 +.s16: + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm0 + mova [dstq+strideq*2], xm0 + mova [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s16 + RET +.h32: + mova ym1, [tlq-32] + vpdpbusd ym0, ym1, ym3 + jmp wq +.w32: + movu ym1, [tlq+1] + vpdpbusd ym0, ym1, ym3 + vextracti32x4 xm1, ym0, 1 + paddd xmm1, xm1, xm0 + punpckhqdq xmm0, xmm1, xmm1 + paddd xmm0, xmm1 + psrlq xmm1, xmm0, 32 + paddd xmm0, xmm1 + vpsrlvd xmm0, xmm4 + cmp hd, 32 + je .w32_end + lea r2d, [hq*2] + mov r6d, 0x33345556 + shrx r6d, r6d, r2d + movd xmm1, r6d + pmulhuw xmm0, xmm1 +.w32_end: + vpbroadcastb ym0, xmm0 +.s32: + mova [dstq+strideq*0], ym0 + mova [dstq+strideq*1], ym0 + mova [dstq+strideq*2], ym0 + mova [dstq+stride3q ], ym0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s32 + RET +.h64: + mova ym1, [tlq-64] + mova ym2, [tlq-32] + vpdpbusd ym0, ym1, ym3 + vpdpbusd ym0, ym2, ym3 + jmp wq +.w64: + movu ym1, [tlq+ 1] + movu ym2, [tlq+33] + vpdpbusd ym0, ym1, ym3 + vpdpbusd ym0, ym2, ym3 + vextracti32x4 xm1, ym0, 1 + paddd xmm1, xm1, xm0 + punpckhqdq xmm0, xmm1, xmm1 + paddd xmm0, xmm1 + psrlq xmm1, xmm0, 32 + paddd xmm0, xmm1 + vpsrlvd xmm0, xmm4 + cmp hd, 64 + je .w64_end + mov r6d, 0x33345556 + shrx r6d, r6d, hd + movd xmm1, r6d + pmulhuw xmm0, xmm1 +.w64_end: + vpbroadcastb m0, xmm0 +.s64: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s64 + RET + +cglobal ipred_dc_128_8bpc, 2, 7, 5, 
dst, stride, tl, w, h, stride3 + lea r5, [ipred_dc_splat_8bpc_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r5+wq*4] + vpbroadcastd m0, [r5-ipred_dc_splat_8bpc_avx512icl_table+pb_128] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +cglobal ipred_v_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 + lea r5, [ipred_dc_splat_8bpc_avx512icl_table] + tzcnt wd, wm + movu m0, [tlq+1] + movifnidn hd, hm + movsxd wq, [r5+wq*4] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h, stride3 +%define base r6-ipred_h_8bpc_avx512icl_table + lea r6, [ipred_h_8bpc_avx512icl_table] + tzcnt wd, wm + mov hd, hm + movsxd wq, [r6+wq*4] + lea stride3q, [strideq*3] + sub tlq, hq + add wq, r6 + jmp wq +.w4: + mova xmm1, [base+ipred_h_shuf+16] +.w4_loop: + movd xmm0, [tlq+hq-4] + pshufb xmm0, xmm1 + movd [dstq+strideq*0], xmm0 + pextrd [dstq+strideq*1], xmm0, 1 + pextrd [dstq+strideq*2], xmm0, 2 + pextrd [dstq+stride3q ], xmm0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_loop + RET +.w8: + movsldup xmm2, [base+ipred_h_shuf+16] + movshdup xmm3, [base+ipred_h_shuf+16] +.w8_loop: + movd xmm1, [tlq+hq-4] + pshufb xmm0, xmm1, xmm2 + pshufb xmm1, xmm3 + movq [dstq+strideq*0], xmm0 + movq [dstq+strideq*1], xmm1 + movhps [dstq+strideq*2], xmm0 + movhps [dstq+stride3q ], xmm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_loop + RET +.w16: + movsldup m1, [base+smooth_shuf] +.w16_loop: + vpbroadcastd m0, [tlq+hq-4] + pshufb m0, m1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16 + RET +.w32: + vpbroadcastd ym3, [base+pb_1] + vpord m2, m3, [base+pb_2] {1to16} +.w32_loop: + vpbroadcastd m1, [tlq+hq-4] + pshufb m0, m1, m2 + pshufb m1, m3 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+stride3q ], m1, 1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w32_loop + RET +.w64: + vpbroadcastd m4, [base+pb_3] + vpbroadcastd m5, [base+pb_2] + vpbroadcastd m6, [base+pb_1] + pxor m7, m7 +.w64_loop: + vpbroadcastd m3, [tlq+hq-4] + pshufb m0, m3, m4 + pshufb m1, m3, m5 + pshufb m2, m3, m6 + pshufb m3, m7 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w64_loop + RET + +%macro PAETH 0 + psubusb m1, m5, m4 + psubusb m0, m4, m5 + por m1, m0 ; tdiff + pavgb m2, m6, m4 + vpcmpub k1, m1, m7, 1 ; tdiff < ldiff + vpblendmb m0{k1}, m4, m6 + vpternlogd m4, m6, m8, 0x28 ; (m4 ^ m6) & m8 + psubusb m3, m5, m2 + psubb m2, m4 + psubusb m2, m5 + por m2, m3 + pminub m1, m7 + paddusb m2, m2 + por m2, m4 ; min(tldiff, 255) + vpcmpub k1, m2, m1, 1 ; tldiff < ldiff && tldiff < tdiff + vmovdqu8 m0{k1}, m5 +%endmacro + +cglobal ipred_paeth_8bpc, 3, 7, 10, dst, stride, tl, w, h, top, stride3 + lea r6, [ipred_paeth_8bpc_avx512icl_table] + tzcnt wd, wm + vpbroadcastb m5, [tlq] ; topleft + mov hd, hm + movsxd wq, [r6+wq*4] + vpbroadcastd m8, [r6-ipred_paeth_8bpc_avx512icl_table+pb_1] + lea topq, [tlq+1] + sub tlq, hq + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +INIT_YMM avx512icl +.w4: + vpbroadcastd m6, [topq] + mova m9, [ipred_h_shuf] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 ; ldiff +.w4_loop: + vpbroadcastq m4, [tlq+hq-8] + pshufb m4, m9 ; left + PAETH + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + pextrd 
[dstq+strideq*2], xm0, 2 + pextrd [dstq+stride3q ], xm0, 3 + sub hd, 8 + jl .w4_ret + vextracti32x4 xm0, m0, 1 + lea dstq, [dstq+strideq*4] + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+stride3q ], xm0, 3 + lea dstq, [dstq+strideq*4] + jg .w4_loop +.w4_ret: + RET +INIT_ZMM avx512icl +.w8: + vpbroadcastq m6, [topq] + movsldup m9, [smooth_shuf] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 +.w8_loop: + vpbroadcastq m4, [tlq+hq-8] + pshufb m4, m9 + PAETH + vextracti32x4 xm1, m0, 2 + vextracti32x4 xm2, ym0, 1 + vextracti32x4 xm3, m0, 3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 + movq [dstq+stride3q ], xm3 + sub hd, 8 + jl .w8_ret + lea dstq, [dstq+strideq*4] + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm3 + lea dstq, [dstq+strideq*4] + jg .w8_loop +.w8_ret: + RET +.w16: + vbroadcasti32x4 m6, [topq] + movsldup m9, [smooth_shuf] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 +.w16_loop: + vpbroadcastd m4, [tlq+hq-4] + pshufb m4, m9 + PAETH + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16_loop + RET +.w32: + vbroadcasti32x8 m6, [topq] + mova ym9, ym8 + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 +.w32_loop: + vpbroadcastd m4, [tlq+hq-2] + pshufb m4, m9 + PAETH + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop + RET +.w64: + movu m6, [topq] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 +.w64_loop: + vpbroadcastb m4, [tlq+hq-1] + PAETH + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET + +cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3 +%define base r6-ipred_smooth_v_8bpc_avx512icl_table + lea r6, [ipred_smooth_v_8bpc_avx512icl_table] + tzcnt wd, wm + mov hd, hm + movsxd wq, [r6+wq*4] + vpbroadcastd m0, [base+pb_127_m127] + vpbroadcastd m1, [base+pw_128] + lea weightsq, [base+smooth_weights+hq*4] + neg hq + vpbroadcastb m4, [tlq+hq] ; bottom + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +.w4: + vpbroadcastd m2, [tlq+1] + movshdup m5, [smooth_shuf] + mova ym6, [smooth_endA] + punpcklbw m2, m4 ; top, bottom + pmaddubsw m3, m2, m0 + paddw m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok + paddw m3, m1 ; 128 * top + 129 * bottom + 128 +.w4_loop: + vbroadcasti32x4 m0, [weightsq+hq*2] + pshufb m0, m5 + pmaddubsw m0, m2, m0 + paddw m0, m3 + vpermb m0, m6, m0 + vextracti32x4 xm1, ym0, 1 + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xm1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+stride3q ], xm1, 2 + add hq, 8 + jg .ret + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 1 + pextrd [dstq+strideq*1], xm1, 1 + pextrd [dstq+strideq*2], xm0, 3 + pextrd [dstq+stride3q ], xm1, 3 + lea dstq, [dstq+strideq*4] + jl .w4_loop +.ret: + RET +.w8: + vpbroadcastq m2, [tlq+1] + movshdup m5, [smooth_shuf] + mova ym6, [smooth_endA] + punpcklbw m2, m4 + pmaddubsw m3, m2, m0 + paddw m1, m2 + paddw m3, m1 +.w8_loop: + vpbroadcastq m0, [weightsq+hq*2] + pshufb m0, m5 + pmaddubsw m0, m2, m0 + paddw m0, m3 + vpermb m0, m6, m0 + vextracti32x4 xm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm1 + lea dstq, [dstq+strideq*4] + add hq, 4 + 
jl .w8_loop + RET +.w16: + vbroadcasti32x4 m3, [tlq+1] + movshdup m6, [smooth_shuf] + mova m7, [smooth_endB] + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + pmaddubsw m4, m2, m0 + pmaddubsw m5, m3, m0 + paddw m0, m1, m2 + paddw m1, m3 + paddw m4, m0 + paddw m5, m1 +.w16_loop: + vpbroadcastq m1, [weightsq+hq*2] + pshufb m1, m6 + pmaddubsw m0, m2, m1 + pmaddubsw m1, m3, m1 + paddw m0, m4 + paddw m1, m5 + vpermt2b m0, m7, m1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w16_loop + RET +.w32: + vbroadcasti32x8 m3, [tlq+1] + movshdup m6, [smooth_shuf] + mova m7, [smooth_endB] + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + pmaddubsw m4, m2, m0 + pmaddubsw m5, m3, m0 + paddw m0, m1, m2 + paddw m1, m3 + paddw m4, m0 + paddw m5, m1 +.w32_loop: + vpbroadcastd m1, [weightsq+hq*2] + pshufb m1, m6 + pmaddubsw m0, m2, m1 + pmaddubsw m1, m3, m1 + paddw m0, m4 + paddw m1, m5 + vpermt2b m0, m7, m1 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + add hq, 2 + jl .w32_loop + RET +.w64: + movu m3, [tlq+1] + mova m6, [smooth_endB] + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + pmaddubsw m4, m2, m0 + pmaddubsw m5, m3, m0 + paddw m0, m1, m2 + paddw m1, m3 + paddw m4, m0 + paddw m5, m1 +.w64_loop: + vpbroadcastw m1, [weightsq+hq*2] + pmaddubsw m0, m2, m1 + pmaddubsw m1, m3, m1 + paddw m0, m4 + paddw m1, m5 + vpermt2b m0, m6, m1 + mova [dstq], m0 + add dstq, strideq + inc hq + jl .w64_loop + RET + +cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl, w, h, stride3 +%define base r5-ipred_smooth_h_8bpc_avx512icl_table + lea r5, [ipred_smooth_h_8bpc_avx512icl_table] + mov r6d, wd + tzcnt wd, wd + vpbroadcastb m4, [tlq+r6] ; right + mov hd, hm + movsxd wq, [r5+wq*4] + vpbroadcastd m5, [base+pb_127_m127] + vpbroadcastd m6, [base+pw_128] + sub tlq, hq + add wq, r5 + vpmovb2m k1, m6 + lea stride3q, [strideq*3] + jmp wq +.w4: + movsldup m3, [smooth_shuf] + vpbroadcastq m7, [smooth_weights+4*2] + mova ym8, [smooth_endA] +.w4_loop: + vpbroadcastq m0, [tlq+hq-8] + mova m2, m4 + vpshufb m2{k1}, m0, m3 ; left, right + pmaddubsw m0, m2, m5 + pmaddubsw m1, m2, m7 + paddw m2, m6 + paddw m0, m2 + paddw m0, m1 + vpermb m0, m8, m0 + vextracti32x4 xm1, ym0, 1 + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xm1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+stride3q ], xm1, 2 + sub hd, 8 + jl .ret + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 1 + pextrd [dstq+strideq*1], xm1, 1 + pextrd [dstq+strideq*2], xm0, 3 + pextrd [dstq+stride3q ], xm1, 3 + lea dstq, [dstq+strideq*4] + jg .w4_loop +.ret: + RET +.w8: + movsldup m3, [smooth_shuf] + vbroadcasti32x4 m7, [smooth_weights+8*2] + mova ym8, [smooth_endA] +.w8_loop: + vpbroadcastd m0, [tlq+hq-4] + mova m2, m4 + vpshufb m2{k1}, m0, m3 + pmaddubsw m0, m2, m5 + pmaddubsw m1, m2, m7 + paddw m2, m6 + paddw m0, m2 + paddw m0, m1 + vpermb m0, m8, m0 + vextracti32x4 xm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_loop + RET +.w16: + movsldup m7, [smooth_shuf] + vbroadcasti32x4 m8, [smooth_weights+16*2] + vbroadcasti32x4 m9, [smooth_weights+16*3] + mova m10, [smooth_endB] +.w16_loop: + vpbroadcastd m0, [tlq+hq-4] + mova m3, m4 + vpshufb m3{k1}, m0, m7 + pmaddubsw m2, m3, m5 + pmaddubsw m0, m3, m8 + pmaddubsw m1, m3, m9 + paddw m3, m6 + paddw m2, m3 + 
paddw m0, m2 + paddw m1, m2 + vpermt2b m0, m10, m1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16_loop + RET +.w32: + mova m10, [smooth_endA] + vpbroadcastd ym7, [pb_1] + vbroadcasti32x8 m8, [smooth_weights+32*2] + vbroadcasti32x8 m9, [smooth_weights+32*3] + vshufi32x4 m10, m10, q3120 +.w32_loop: + vpbroadcastd m0, [tlq+hq-2] + mova m3, m4 + vpshufb m3{k1}, m0, m7 + pmaddubsw m2, m3, m5 + pmaddubsw m0, m3, m8 + pmaddubsw m1, m3, m9 + paddw m3, m6 + paddw m2, m3 + paddw m0, m2 + paddw m1, m2 + vpermt2b m0, m10, m1 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop + RET +.w64: + mova m7, [smooth_weights+64*2] + mova m8, [smooth_weights+64*3] + mova m9, [smooth_endA] +.w64_loop: + mova m3, m4 + vpbroadcastb m3{k1}, [tlq+hq-1] + pmaddubsw m2, m3, m5 + pmaddubsw m0, m3, m7 + pmaddubsw m1, m3, m8 + paddw m3, m6 + paddw m2, m3 + paddw m0, m2 + paddw m1, m2 + vpermt2b m0, m9, m1 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET + +cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3 +%define base r5-ipred_smooth_8bpc_avx512icl_table + lea r5, [ipred_smooth_8bpc_avx512icl_table] + mov r6d, wd + tzcnt wd, wd + mov hd, hm + vpbroadcastb m6, [tlq+r6] ; right + sub tlq, hq + movsxd wq, [r5+wq*4] + vpbroadcastd m7, [base+pb_127_m127] + vpbroadcastb m0, [tlq] ; bottom + vpbroadcastd m1, [base+pw_255] + add wq, r5 + lea v_weightsq, [base+smooth_weights+hq*2] + vpmovb2m k1, m1 + lea stride3q, [strideq*3] + jmp wq +.w4: + vpbroadcastd m8, [tlq+hq+1] + movsldup m4, [smooth_shuf] + movshdup m5, [smooth_shuf] + vpbroadcastq m9, [smooth_weights+4*2] + mova ym11, [smooth_endA] + + punpcklbw m8, m0 ; top, bottom + pmaddubsw m10, m8, m7 + paddw m1, m8 ; 1 * top + 256 * bottom + 255 + paddw m10, m1 ; 128 * top + 129 * bottom + 255 +.w4_loop: + vpbroadcastq m1, [tlq+hq-8] + vbroadcasti32x4 m0, [v_weightsq] + add v_weightsq, 16 + mova m2, m6 + vpshufb m2{k1}, m1, m4 ; left, right + pmaddubsw m1, m2, m7 ; 127 * left - 127 * right + pshufb m0, m5 + pmaddubsw m0, m8, m0 + paddw m1, m2 ; 128 * left + 129 * right + pmaddubsw m2, m9 + paddw m0, m10 + paddw m1, m2 + pavgw m0, m1 + vpermb m0, m11, m0 + vextracti32x4 xm1, ym0, 1 + movd [dstq+strideq*0], xm0 + movd [dstq+strideq*1], xm1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+stride3q ], xm1, 2 + sub hd, 8 + jl .ret + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 1 + pextrd [dstq+strideq*1], xm1, 1 + pextrd [dstq+strideq*2], xm0, 3 + pextrd [dstq+stride3q ], xm1, 3 + lea dstq, [dstq+strideq*4] + jg .w4_loop +.ret: + RET +.w8: + vpbroadcastq m8, [tlq+hq+1] + movsldup m4, [smooth_shuf] + movshdup m5, [smooth_shuf] + vbroadcasti32x4 m9, [smooth_weights+8*2] + mova ym11, [smooth_endA] + punpcklbw m8, m0 + pmaddubsw m10, m8, m7 + paddw m1, m8 + paddw m10, m1 +.w8_loop: + vpbroadcastd m1, [tlq+hq-4] + vpbroadcastq m0, [v_weightsq] + add v_weightsq, 8 + mova m2, m6 + vpshufb m2{k1}, m1, m4 + pmaddubsw m1, m2, m7 + pshufb m0, m5 + pmaddubsw m0, m8, m0 + paddw m1, m2 + pmaddubsw m2, m9 + paddw m0, m10 + paddw m1, m2 + pavgw m0, m1 + vpermb m0, m11, m0 + vextracti32x4 xm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8_loop + RET +.w16: + vbroadcasti32x4 m9, [tlq+hq+1] 
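+ ; As in the smaller widths, both blends stay in 16-bit words: pmaddubsw
+ ; with the signed (w-128, 127-w) weight pairs plus the precomputed
+ ; "128*top + 129*bottom + 255" term gives wv*top + (256-wv)*bottom + 255,
+ ; and likewise for left/right, so that pavgw followed by taking the odd
+ ; bytes computes, in scalar terms (illustrative sketch):
+ ;   pred = (wv*top + (256-wv)*bottom + wh*left + (256-wh)*right + 256) >> 9;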
+ movsldup m5, [smooth_shuf] + movshdup m10, [smooth_shuf] + vbroadcasti32x4 m11, [smooth_weights+16*2] + vbroadcasti32x4 m12, [smooth_weights+16*3] + mova m15, [smooth_endB] + punpcklbw m8, m9, m0 + punpckhbw m9, m0 + pmaddubsw m13, m8, m7 + pmaddubsw m14, m9, m7 + paddw m0, m1, m8 + paddw m1, m9 + paddw m13, m0 + paddw m14, m1 +.w16_loop: + vpbroadcastd m0, [tlq+hq-4] + vpbroadcastq m1, [v_weightsq] + add v_weightsq, 8 + mova m4, m6 + vpshufb m4{k1}, m0, m5 + pmaddubsw m2, m4, m7 + pshufb m1, m10 + pmaddubsw m0, m8, m1 + pmaddubsw m1, m9, m1 + paddw m2, m4 + pmaddubsw m3, m4, m11 + pmaddubsw m4, m12 + paddw m0, m13 + paddw m1, m14 + paddw m3, m2 + paddw m4, m2 + pavgw m0, m3 + pavgw m1, m4 + vpermt2b m0, m15, m1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16_loop + RET +.w32: + vbroadcasti32x8 m9, [tlq+hq+1] + movshdup m10, [smooth_shuf] + mova m12, [smooth_weights+32*2] + vpbroadcastd ym5, [pb_1] + mova m15, [smooth_endB] + punpcklbw m8, m9, m0 + punpckhbw m9, m0 + pmaddubsw m13, m8, m7 + pmaddubsw m14, m9, m7 + vshufi32x4 m11, m12, m12, q2020 + vshufi32x4 m12, m12, q3131 + paddw m0, m1, m8 + paddw m1, m9 + paddw m13, m0 + paddw m14, m1 +.w32_loop: + vpbroadcastd m0, [tlq+hq-2] + vpbroadcastd m1, [v_weightsq] + add v_weightsq, 4 + mova m4, m6 + vpshufb m4{k1}, m0, m5 + pmaddubsw m2, m4, m7 + pshufb m1, m10 + pmaddubsw m0, m8, m1 + pmaddubsw m1, m9, m1 + paddw m2, m4 + pmaddubsw m3, m4, m11 + pmaddubsw m4, m12 + paddw m0, m13 + paddw m1, m14 + paddw m3, m2 + paddw m4, m2 + pavgw m0, m3 + pavgw m1, m4 + vpermt2b m0, m15, m1 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop + RET +.w64: + movu m9, [tlq+hq+1] + mova m11, [smooth_weights+64*2] + mova m2, [smooth_weights+64*3] + mova m14, [smooth_endB] + punpcklbw m8, m9, m0 + punpckhbw m9, m0 + pmaddubsw m12, m8, m7 + pmaddubsw m13, m9, m7 + vshufi32x4 m10, m11, m2, q2020 + vshufi32x4 m11, m2, q3131 + paddw m0, m1, m8 + paddw m1, m9 + paddw m12, m0 + paddw m13, m1 +.w64_loop: + mova m4, m6 + vpbroadcastb m4{k1}, [tlq+hq-1] + vpbroadcastw m1, [v_weightsq] + add v_weightsq, 2 + pmaddubsw m2, m4, m7 + pmaddubsw m0, m8, m1 + pmaddubsw m1, m9, m1 + paddw m2, m4 + pmaddubsw m3, m4, m10 + pmaddubsw m4, m11 + paddw m0, m12 + paddw m1, m13 + paddw m3, m2 + paddw m4, m2 + pavgw m0, m3 + pavgw m1, m4 + vpermt2b m0, m14, m1 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET + +cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3 + lea r6, [pal_pred_8bpc_avx512icl_table] + tzcnt wd, wm + vbroadcasti32x4 m4, [palq] + movifnidn hd, hm + movsxd wq, [r6+wq*4] + packuswb m4, m4 + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +.w4: + pshufb xmm0, xm4, [idxq] + add idxq, 16 + movd [dstq+strideq*0], xmm0 + pextrd [dstq+strideq*1], xmm0, 1 + pextrd [dstq+strideq*2], xmm0, 2 + pextrd [dstq+stride3q ], xmm0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4 + RET +.w8: + pshufb xmm0, xm4, [idxq+16*0] + pshufb xmm1, xm4, [idxq+16*1] + add idxq, 16*2 + movq [dstq+strideq*0], xmm0 + movhps [dstq+strideq*1], xmm0 + movq [dstq+strideq*2], xmm1 + movhps [dstq+stride3q ], xmm1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8 + RET +.w16: + pshufb m0, m4, [idxq] + add idxq, 64 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 
[dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16 + RET +.w32: + pshufb m0, m4, [idxq+64*0] + pshufb m1, m4, [idxq+64*1] + add idxq, 64*2 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+stride3q ], m1, 1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w32 + RET +.w64: + pshufb m0, m4, [idxq+64*0] + pshufb m1, m4, [idxq+64*1] + pshufb m2, m4, [idxq+64*2] + pshufb m3, m4, [idxq+64*3] + add idxq, 64*4 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w64 + RET + +; The ipred_filter code processes 4x2 blocks in the following order +; which increases parallelism compared to doing things row by row. +; Some redundant blocks are calculated for w > 4. +; w4 w8 w16 w32 +; 1 1 2 1 2 3 4 1 2 3 4 9 a b c +; 2 2 3 2 3 4 5 2 3 4 5 a b c d +; 3 3 4 3 4 5 6 3 4 5 6 b c d e +; 4 4 5 4 5 6 7 4 5 6 7 c d e f +; 5 5 6 5 6 7 8 5 6 7 8 d e f g +; 6 6 7 6 7 8 9 6 7 8 9 e f g h +; 7 7 8 7 8 9 a 7 8 9 a f g h i +; ___ 8 ___ 8 9 ___ 8 9 a b ___ 8 9 a b g h i j ___ +; 9 9 a b h i j +; a b i j +; b j + +cglobal ipred_filter_8bpc, 4, 7, 14, dst, stride, tl, w, h, flt +%define base r6-filter_taps + lea r6, [filter_taps] +%ifidn fltd, fltm + movzx fltd, fltb +%else + movzx fltd, byte fltm +%endif + vpbroadcastd xmm2, [tlq+1] ; t0 t0 t0 t0 + movifnidn hd, hm + shl fltd, 6 + vpbroadcastd m6, [base+pd_8] + vpbroadcastd xmm3, [tlq-2] ; l1 l0 tl __ + vbroadcasti32x4 m7, [r6+fltq+16*0] ; p1 p2 p3 p4 + vbroadcasti32x4 m8, [r6+fltq+16*1] + vbroadcasti32x4 m9, [r6+fltq+16*2] ; p6 p5 p0 __ + vbroadcasti32x4 m10, [r6+fltq+16*3] + mova xmm0, xm6 + vpdpbusd xmm0, xmm2, xm7 + mova xmm1, xm6 + vpdpbusd xmm1, xmm2, xm8 + vpdpbusd xmm0, xmm3, xm9 + vpdpbusd xmm1, xmm3, xm10 + packssdw xmm0, xmm1 + cmp wd, 8 + jb .w4 + vpbroadcastd ym2, [tlq+5] + mova m11, [base+filter_perm] + mov r5, 0xffffffffffff000f + psrldq xmm2, 1 ; __ t0 + kmovq k1, r5 ; 0x000f + psraw xm5, xmm0, 4 + packuswb xmm2, xm5 ; __ t0 a0 b0 + pshufd ym2{k1}, ymm2, q3333 ; b0 b0 b0 b0 t1 t1 t1 t1 + je .w8 + kxnorb k3, k3, k3 ; 0x00ff + vpbroadcastd xm3, [tlq-4] + kandnq k2, k3, k1 ; 0xffffffffffff0000 + vpermb ym3{k2}, ym11, ymm2 ; l3 l2 l1 __ b3 a3 t3 __ + mova ym0, ym6 + vpdpbusd ym0, ym2, ym7 + mova ym1, ym6 + vpdpbusd ym1, ym2, ym8 + pshufb ym5{k2}, ym2, ym11 ; a0 b0 __ t0 + vpbroadcastd m2, [tlq+9] + vpdpbusd ym0, ym3, ym9 + vpdpbusd ym1, ym3, ym10 + vpbroadcastd xm3, [tlq-6] ; l5 l4 l3 __ + kunpckbw k4, k1, k3 ; 0x0fff + packssdw ym0, ym1 + psraw ym0, 4 ; a0 d0 a1 b1 + packuswb ym5, ym0 ; a0 b0 c0 d0 __ t1 a1 b1 + pshufd m2{k3}, m5, q3333 ; d0 d0 d0 d0 b1 b1 b1 b1 t2 t2 t2 t2 + vpermb m3{k2}, m11, m5 ; l5 l4 l3 __ d3 c3 b3 __ b7 a7 t7 __ + mova m4, m6 + vpdpbusd m4, m2, m7 + mova m1, m6 + vpdpbusd m1, m2, m8 + psrldq m0, m2, 1 ; __ d0 __ b0 __ t0 + vpbroadcastd m2, [tlq+13] + vpdpbusd m4, m3, m9 + vpdpbusd m1, m3, m10 + mova m12, [base+filter_end] + lea r5d, [hq-6] + mov r6, dstq + cmovp hd, r5d ; w == 16 ? 
h : h - 6 + packssdw m4, m1 + psraw m4, 4 ; e0 f0 c1 d1 a2 b2 + packuswb m0, m4 ; __ d0 e0 f0 __ b1 c1 d1 __ t2 a2 b2 + pshufd m2{k4}, m0, q3333 ; f0 f0 f0 f0 d1 d1 d1 d1 b2 b2 b2 b2 t3 t3 t3 t3 +.w16_loop: + vpbroadcastd xm3, [tlq-8] + vpermb m3{k2}, m11, m0 ; l7 l6 l5 __ f3 e3 d3 __ d7 c7 b7 __ bb ab tb __ + mova m1, m6 + vpdpbusd m1, m2, m7 + mova m0, m6 + vpdpbusd m0, m2, m8 + sub tlq, 2 + vpdpbusd m1, m3, m9 + vpdpbusd m0, m3, m10 + packssdw m1, m0 + mova m0, m4 + psraw m4, m1, 4 ; g0 h0 e1 f1 c2 d2 a3 b3 + packuswb m0, m4 ; e0 f0 g0 h0 c1 d1 e1 f1 a2 b2 c2 d2 __ __ a3 b3 + pshufd m2, m0, q3333 ; h0 h0 h0 h0 f1 f1 f1 f1 d2 d2 d2 d2 b3 b3 b3 b3 + vpermt2d m5, m12, m0 ; c0 d0 e0 f0 __ __ c1 d1 a0 a1 a2 a3 b0 b1 b2 b3 + vextracti32x4 [dstq+strideq*0], m5, 2 + vextracti32x4 [dstq+strideq*1], m5, 3 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_loop + cmp wd, 16 + je .ret + mova xm13, [filter_perm+16] + mova xmm3, [r6+strideq*0] + punpckhdq xmm3, [r6+strideq*1] + vpbroadcastd m2{k1}, [tlq+r5+17] ; t4 t4 t4 t4 f1 f1 f1 f1 d2 d2 d2 d2 b3 b3 b3 b3 + pinsrb xm3, xmm3, [tlq+r5+16], 7 + pshufb xm3, xm13 + vpermb m3{k2}, m11, m0 ; bf af tf __ h3 g3 f3 __ f7 e7 d7 __ db cb bb __ + mova m0, m6 + vpdpbusd m0, m2, m7 + mova m1, m6 + vpdpbusd m1, m2, m8 + kunpckbw k5, k3, k1 ; 0xff0f + lea r3, [strideq*3] + vpdpbusd m0, m3, m9 + vpdpbusd m1, m3, m10 + packssdw m0, m1 + psraw m0, 4 ; a4 b4 g1 h1 e2 f2 c3 d3 + packuswb m4, m0 ; g0 h0 a4 b4 e1 f1 g1 h1 c2 d2 e2 f2 __ __ c3 d3 + vpblendmb m1{k3}, m4, m2 ; __ t4 a4 b4 e1 f1 g1 h1 c2 d2 e2 f2 __ __ c3 d3 + vpbroadcastd ym2, [tlq+r5+21] + pshufd m2{k5}, m4, q3333 ; b4 b4 b4 b4 t5 t5 t5 t5 f2 f2 f2 f2 d3 d3 d3 d3 + vpermt2d m5, m12, m4 ; e0 f0 g0 h0 __ __ e1 f1 c0 c1 c2 c3 d0 d1 d2 d3 + vextracti32x4 [dstq+strideq*0], m5, 2 + vextracti32x4 [dstq+strideq*1], m5, 3 + punpckhqdq xmm3, [r6+r3] + pinsrb xmm3, [r6+strideq*2+15], 11 + pshufb xm3, xmm3, xm13 + vpermb m3{k2}, m11, m1 ; df cf bf __ bj aj tj __ h7 g7 f7 __ fb eb db __ + mova m4, m6 + vpdpbusd m4, m2, m7 + mova m1, m6 + vpdpbusd m1, m2, m8 + kxnord k3, k3, k4 ; 0xfffff0ff + lea r4, [strideq*5] + vpdpbusd m4, m3, m9 + vpdpbusd m1, m3, m10 + packssdw m4, m1 + psraw m4, 4 ; c4 d4 a5 b5 g2 h2 e3 f3 + packuswb m0, m4 ; a4 b4 c4 d4 g1 h1 a5 b5 e2 f2 g2 h2 __ __ e3 f3 + vpblendmw m1{k3}, m2, m0 ; a4 b4 c4 d4 __ t5 a5 b5 e2 f2 g2 h2 __ __ e3 f3 + vpbroadcastd m2, [tlq+r5+25] + pshufd m2{k3}, m0, q3333 ; d4 d4 d4 d4 b5 b5 b5 b5 t6 t6 t6 t6 f3 f3 f3 f3 + vpermt2d m5, m12, m0 ; g0 h0 a4 b4 __ __ g1 h1 e0 e1 e2 e3 f0 f1 f2 f3 + vextracti32x4 [dstq+strideq*2], m5, 2 + vextracti32x4 [dstq+r3 ], m5, 3 + punpckhqdq xmm3, [r6+r4] + pinsrb xmm3, [r6+strideq*4+15], 11 + pshufb xm3, xmm3, xm13 + vpermb m3{k2}, m11, m1 ; ff ef df __ dj cj bj __ bn an tn __ hb hb fb __ + mova m0, m6 + vpdpbusd m0, m2, m7 + mova m1, m6 + vpdpbusd m1, m2, m8 + kunpckwd k1, k1, k2 ; 0x000f0000 + vpdpbusd m0, m3, m9 + vpdpbusd m1, m3, m10 + packssdw m0, m1 + psraw m0, 4 ; e4 f4 c5 d5 a6 b6 g3 h3 + packuswb m4, m0 ; c4 d4 e4 f4 a5 b5 c5 d5 g2 h2 a6 b6 __ __ g3 h3 + vpblendmw m1{k1}, m4, m2 ; c4 d4 e4 f4 a5 b5 c5 d5 __ t6 a6 b6 __ __ g3 h3 + vpbroadcastd m2, [tlq+r5+29] + pshufd m2{k4}, m4, q3333 ; f4 f4 f4 f4 d5 d5 d5 d5 b6 b6 b6 b6 t7 t7 t7 t7 + vpermt2d m5, m12, m4 ; a4 b4 c4 d4 __ __ a5 b5 g0 g1 g2 g3 h0 h1 h2 h3 + vextracti32x4 [dstq+strideq*4], m5, 2 + vextracti32x4 [dstq+r4 ], m5, 3 + lea r0, [strideq+r3*2] +.w32_loop: + punpckhqdq xmm3, [r6+r0] + pinsrb xmm3, [r6+r3*2+15], 11 + pshufb xm3, xmm3, xm13 + vpermb m3{k2}, m11, m1 ; hf gf ff 
__ fj ej dj __ dn cn bn __ br ar tr __ +.w32_loop_tail: + mova m4, m6 + vpdpbusd m4, m2, m7 + mova m1, m6 + vpdpbusd m1, m2, m8 + vpdpbusd m4, m3, m9 + vpdpbusd m1, m3, m10 + packssdw m4, m1 + mova m1, m0 + psraw m0, m4, 4 ; g4 h4 e5 f5 c6 d6 a7 b7 + packuswb m1, m0 ; e4 f4 g4 h4 c5 d5 e5 f5 a6 b6 c6 d6 __ __ a7 b7 + pshufd m2, m1, q3333 ; h4 h4 h4 h4 f5 f5 f5 f5 d6 d6 d6 d6 b7 b7 b7 b7 + vpermt2d m5, m12, m1 ; c4 d4 e4 f4 __ __ c5 d5 a4 a5 a6 a7 b4 b5 b6 b7 + vextracti32x4 [r6+strideq*0+16], m5, 2 + vextracti32x4 [r6+strideq*1+16], m5, 3 + lea r6, [r6+strideq*2] + sub r5d, 2 + jg .w32_loop + vpermb m3, m11, m1 + cmp r5d, -6 + jg .w32_loop_tail +.ret: + RET +.w8: + vpermb ym3, ym11, ymm2 +.w8_loop: + vpbroadcastd ym3{k1}, [tlq-4] ; l3 l2 l1 __ b3 a3 t3 __ + mova ym0, ym6 + vpdpbusd ym0, ym2, ym7 + mova ym1, ym6 + vpdpbusd ym1, ym2, ym8 + sub tlq, 2 + vpdpbusd ym0, ym3, ym9 + vpdpbusd ym1, ym3, ym10 + mova ym3, ym5 + packssdw ym0, ym1 + psraw ym5, ym0, 4 ; c0 d0 a1 b1 + packuswb ym3, ym5 ; a0 b0 c0 d0 __ __ a1 b1 + pshufd ym2, ym3, q3333 ; d0 d0 d0 d0 b1 b1 b1 b1 + vpermb ym3, ym11, ym3 ; a0 a1 b0 b1 + movq [dstq+strideq*0], xm3 + movhps [dstq+strideq*1], xm3 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET +.w4_loop: + vpbroadcastd xmm3, [tlq-4] ; l3 l2 l1 __ + mova xmm0, xm6 + vpdpbusd xmm0, xmm2, xm7 + mova xmm1, xm6 + vpdpbusd xmm1, xmm2, xm8 + sub tlq, 2 + vpdpbusd xmm0, xmm3, xm9 + vpdpbusd xmm1, xmm3, xm10 + packssdw xmm0, xmm1 +.w4: + psraw xmm0, 4 ; a0 b0 + packuswb xmm0, xmm0 + movd [dstq+strideq*0], xmm0 + pshufd xmm2, xmm0, q1111 ; b0 b0 b0 b0 + movd [dstq+strideq*1], xmm2 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w4_loop + RET + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/ipred_sse.asm b/third_party/dav1d/src/x86/ipred_sse.asm new file mode 100644 index 0000000000..67e90b79ae --- /dev/null +++ b/third_party/dav1d/src/x86/ipred_sse.asm @@ -0,0 +1,5409 @@ +; Copyright © 2018-2021, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
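+
+; The smooth_weights table below stores each weight w as the signed
+; byte pair (w-128, 127-w) so that one pmaddubsw against an
+; interleaved (a, b) pixel pair evaluates most of the blend. As a
+; minimal scalar sketch of that identity (blend, a, b and w are
+; illustrative names, not symbols from this file):
+;
+;   static int blend(int a, int b, int w) {
+;       int lo = (w - 128) * a + (127 - w) * b; // the table's byte pair
+;       int hi = 128 * a + 129 * b + 128;       // precomputed per pair
+;       return (lo + hi) >> 8; // == (w*a + (256-w)*b + 128) >> 8
+;   }
+;
+; ipred_smooth_v below applies this with a = top[x], b = the bottom
+; left pixel and w indexed by row; ipred_smooth_h mirrors it with
+; left/right pixels and w indexed by column.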
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA 16 + +%macro SMOOTH_WEIGHT_TABLE 1-* + %rep %0 + db %1-128, 127-%1 + %rotate 1 + %endrep +%endmacro + +; sm_weights[], but modified to precalculate x and 256-x with offsets to +; enable efficient use of pmaddubsw (which requires signed values) +smooth_weights: SMOOTH_WEIGHT_TABLE \ + 0, 0, 255, 128, 255, 149, 85, 64, \ + 255, 197, 146, 105, 73, 50, 37, 32, \ + 255, 225, 196, 170, 145, 123, 102, 84, \ + 68, 54, 43, 33, 26, 20, 17, 16, \ + 255, 240, 225, 210, 196, 182, 169, 157, \ + 145, 133, 122, 111, 101, 92, 83, 74, \ + 66, 59, 52, 45, 39, 34, 29, 25, \ + 21, 17, 14, 12, 10, 9, 8, 8, \ + 255, 248, 240, 233, 225, 218, 210, 203, \ + 196, 189, 182, 176, 169, 163, 156, 150, \ + 144, 138, 133, 127, 121, 116, 111, 106, \ + 101, 96, 91, 86, 82, 77, 73, 69, \ + 65, 61, 57, 54, 50, 47, 44, 41, \ + 38, 35, 32, 29, 27, 25, 22, 20, \ + 18, 16, 15, 13, 12, 10, 9, 8, \ + 7, 6, 6, 5, 5, 4, 4, 4 + +ipred_v_shuf: db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7 +ipred_h_shuf: db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 +ipred_paeth_shuf: db 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 +z_upsample1: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 +z_upsample2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8 +z_transpose4: db 8, 12, 0, 4, 9, 13, 1, 5, 10, 14, 2, 6, 11, 15, 3, 7 +z3_shuf: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 +z3_shuf_h4: db 4, 3, 3, 2, 2, 1, 1, 0, 12, 11, 11, 10, 10, 9, 9, 8 +filter_shuf1: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1 +filter_shuf2: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11, 15, -1, 15, -1 +z_filter_wh4: db 7, 7, 19, 7, +z_filter_wh8: db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39 +pd_32768: dd 32768 +z3_filter_k_tail: db 64, 0, 64, 0, 64, 0, 56, 8 +z1_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 +pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +pb_15to0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 +z3_base_inc: dw 7*64, 6*64, 5*64, 4*64, 3*64, 2*64, 1*64, 0*64 +z_filter_wh16: db 19, 19, 19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1 +z_filter_t_w48: db 55,127, 7,127, 15, 31, 39, 31,127, 39,127, 39, 7, 15, 31, 15 + db 39, 63, 3, 63, 3, 3, 19, 3, 47, 19, 47, 19, 3, 3, 3, 3 +z_filter_t_w16: db 15, 31, 7, 15, 31, 7, 3, 31, 3, 3, 3, 3, 3, 3, 0, 0 +z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7 + db 7, 8, 8, 9, 9, 10, 10, 11 +z_filter_k_tail: db 0, 64, 0, 64, 8, 56, 0, 64 +z2_h_shuf: db 7, 6, 15, 14, 6, 5, 14, 13, 5, 4, 13, 12, 4, 3, 12, 11 +z2_upsample: db 7, 6, 15, 14, 5, 4, 13, 12, 3, 2, 11, 10, 1, 0, 9, 8 +z2_dy_offset: dw 88*64, 88*64, 87*64, 87*64 +pw_m1to4: dw -1, -2, -3, -4 +z_filter_k: times 4 db 0, 16 + times 4 db 0, 20 + times 4 db 8, 16 + times 4 db 32, 16 + times 4 db 24, 20 + times 4 db 16, 16 + times 4 db 0, 0 + times 4 db 0, 0 +pw_8: times 8 db 8, 0 +pb_3: times 16 db 3 +pb_16: times 16 db 16 +pw_62: times 8 dw 62 +pw_64: times 8 dw 64 +pw_256: times 8 dw 256 +pw_512: times 8 dw 512 +pw_m256: times 8 dw -256 +pb_2: times 8 db 2 +pb_4: times 8 db 4 +pb_8: times 8 db 8 +pb_128: times 8 db 128 +pb_m16: times 8 db -16 +pw_128: times 4 dw 128 +pw_255: times 4 dw 255 +pb_36_m4: times 4 db 36, -4 +pb_127_m127: times 4 db 127, -127 + +%macro JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - 2*4) + %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2) + %%table: + %rep %0 - 2 + dd %%base %+ .%3 - (%%table - 2*4) + %rotate 
1 + %endrep +%endmacro + +%define ipred_dc_splat_ssse3_table (ipred_dc_ssse3_table + 10*4) +%define ipred_cfl_splat_ssse3_table (ipred_cfl_ssse3_table + 8*4) + +JMP_TABLE ipred_h, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_dc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ + s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 +JMP_TABLE ipred_dc_left, ssse3, h4, h8, h16, h32, h64 +JMP_TABLE ipred_smooth, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_v, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_smooth_h, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_paeth, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z1, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z2, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z3, ssse3, h4, h8, h16, h32, h64 +JMP_TABLE pal_pred, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_cfl, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \ + s4-8*4, s8-8*4, s16-8*4, s32-8*4 +JMP_TABLE ipred_cfl_left, ssse3, h4, h8, h16, h32 +JMP_TABLE ipred_filter, ssse3, w4, w8, w16, w32 + +cextern dr_intra_derivative +cextern filter_intra_taps + +SECTION .text + +;--------------------------------------------------------------------------------------- +;int dav1d_ipred_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int a); +;--------------------------------------------------------------------------------------- +%macro IPRED_SET 3 ; width, stride, stride size pshuflw_imm8 + pshuflw m1, m0, %3 ; extend 8 byte for 2 pos + punpcklqdq m1, m1 + mova [dstq + %2], m1 +%if %1 > 16 + mova [dstq + 16 + %2], m1 +%endif +%if %1 > 32 + mova [dstq + 32 + %2], m1 + mova [dstq + 48 + %2], m1 +%endif +%endmacro + +%macro IPRED_H 1 ; width + sub tlq, 4 + movd m0, [tlq] ; get 4 bytes of topleft data + punpcklbw m0, m0 ; extend 2 byte +%if %1 == 4 + pshuflw m1, m0, q2233 + movd [dstq+strideq*0], m1 + psrlq m1, 32 + movd [dstq+strideq*1], m1 + pshuflw m0, m0, q0011 + movd [dstq+strideq*2], m0 + psrlq m0, 32 + movd [dstq+stride3q ], m0 + +%elif %1 == 8 + punpcklwd m0, m0 + punpckhdq m1, m0, m0 + punpckldq m0, m0 + movq [dstq+strideq*1], m1 + movhps [dstq+strideq*0], m1 + movq [dstq+stride3q ], m0 + movhps [dstq+strideq*2], m0 +%else + IPRED_SET %1, 0, q3333 + IPRED_SET %1, strideq, q2222 + IPRED_SET %1, strideq*2, q1111 + IPRED_SET %1, stride3q, q0000 +%endif + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w%1 + RET +%endmacro + +INIT_XMM ssse3 +cglobal ipred_h_8bpc, 3, 6, 2, dst, stride, tl, w, h, stride3 + LEA r5, ipred_h_ssse3_table + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r5+wq*4] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq +.w4: + IPRED_H 4 +.w8: + IPRED_H 8 +.w16: + IPRED_H 16 +.w32: + IPRED_H 32 +.w64: + IPRED_H 64 + +;--------------------------------------------------------------------------------------- +;int dav1d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int a); +;--------------------------------------------------------------------------------------- +cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 + LEA r5, ipred_dc_splat_ssse3_table + tzcnt wd, wm + movu m0, [tlq+ 1] + movu m1, [tlq+17] + movu m2, [tlq+33] + movu m3, [tlq+49] + movifnidn hd, hm + movsxd wq, [r5+wq*4] + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +;--------------------------------------------------------------------------------------- +;int dav1d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, 
const int height, const int a); +;--------------------------------------------------------------------------------------- +cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 + movifnidn hd, hm + movifnidn wd, wm + tzcnt r6d, hd + lea r5d, [wq+hq] + movd m4, r5d + tzcnt r5d, r5d + movd m5, r5d + LEA r5, ipred_dc_ssse3_table + tzcnt wd, wd + movsxd r6, [r5+r6*4] + movsxd wq, [r5+wq*4+20] + pcmpeqd m3, m3 + psrlw m4, 1 ; dc = (width + height) >> 1; + add r6, r5 + add wq, r5 + lea stride3q, [strideq*3] + jmp r6 +.h4: + movd m0, [tlq-4] + pmaddubsw m0, m3 + jmp wq +.w4: + movd m1, [tlq+1] + pmaddubsw m1, m3 + psubw m0, m4 + paddw m0, m1 + pmaddwd m0, m3 + cmp hd, 4 + jg .w4_mul + psrlw m0, 3 ; dc >>= ctz(width + height); + jmp .w4_end +.w4_mul: + punpckhqdq m1, m0, m0 + paddw m0, m1 + psrlq m1, m0, 32 + paddw m0, m1 + psrlw m0, 2 + mov r6d, 0x5556 + mov r2d, 0x3334 + test hd, 8 + cmovz r6d, r2d + movd m5, r6d + pmulhuw m0, m5 +.w4_end: + pxor m1, m1 + pshufb m0, m1 +.s4: + movd [dstq+strideq*0], m0 + movd [dstq+strideq*1], m0 + movd [dstq+strideq*2], m0 + movd [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s4 + RET +ALIGN function_align +.h8: + movq m0, [tlq-8] + pmaddubsw m0, m3 + jmp wq +.w8: + movq m1, [tlq+1] + pmaddubsw m1, m3 + psubw m4, m0 + punpckhqdq m0, m0 + psubw m0, m4 + paddw m0, m1 + pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 + paddw m0, m1 + pmaddwd m0, m3 + psrlw m0, m5 + cmp hd, 8 + je .w8_end + mov r6d, 0x5556 + mov r2d, 0x3334 + cmp hd, 32 + cmovz r6d, r2d + movd m1, r6d + pmulhuw m0, m1 +.w8_end: + pxor m1, m1 + pshufb m0, m1 +.s8: + movq [dstq+strideq*0], m0 + movq [dstq+strideq*1], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s8 + RET +ALIGN function_align +.h16: + mova m0, [tlq-16] + pmaddubsw m0, m3 + jmp wq +.w16: + movu m1, [tlq+1] + pmaddubsw m1, m3 + paddw m0, m1 + psubw m4, m0 + punpckhqdq m0, m0 + psubw m0, m4 + pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 + paddw m0, m1 + pmaddwd m0, m3 + psrlw m0, m5 + cmp hd, 16 + je .w16_end + mov r6d, 0x5556 + mov r2d, 0x3334 + test hd, 8|32 + cmovz r6d, r2d + movd m1, r6d + pmulhuw m0, m1 +.w16_end: + pxor m1, m1 + pshufb m0, m1 +.s16: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s16 + RET +ALIGN function_align +.h32: + mova m0, [tlq-32] + pmaddubsw m0, m3 + mova m2, [tlq-16] + pmaddubsw m2, m3 + paddw m0, m2 + jmp wq +.w32: + movu m1, [tlq+1] + pmaddubsw m1, m3 + movu m2, [tlq+17] + pmaddubsw m2, m3 + paddw m1, m2 + paddw m0, m1 + psubw m4, m0 + punpckhqdq m0, m0 + psubw m0, m4 + pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 + paddw m0, m1 + pmaddwd m0, m3 + psrlw m0, m5 + cmp hd, 32 + je .w32_end + lea r2d, [hq*2] + mov r6d, 0x5556 + mov r2d, 0x3334 + test hd, 64|16 + cmovz r6d, r2d + movd m1, r6d + pmulhuw m0, m1 +.w32_end: + pxor m1, m1 + pshufb m0, m1 + mova m1, m0 +.s32: + mova [dstq], m0 + mova [dstq+16], m1 + mova [dstq+strideq], m0 + mova [dstq+strideq+16], m1 + mova [dstq+strideq*2], m0 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q], m0 + mova [dstq+stride3q+16], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .s32 + RET +ALIGN function_align +.h64: + mova m0, [tlq-64] + mova m1, [tlq-48] + pmaddubsw m0, m3 + pmaddubsw m1, m3 + paddw m0, m1 + mova m1, [tlq-32] + pmaddubsw m1, m3 + paddw m0, m1 + mova m1, [tlq-16] + pmaddubsw m1, m3 + paddw m0, m1 + jmp wq +.w64: + movu m1, [tlq+ 1] + movu m2, [tlq+17] + pmaddubsw 
m1, m3 + pmaddubsw m2, m3 + paddw m1, m2 + movu m2, [tlq+33] + pmaddubsw m2, m3 + paddw m1, m2 + movu m2, [tlq+49] + pmaddubsw m2, m3 + paddw m1, m2 + paddw m0, m1 + psubw m4, m0 + punpckhqdq m0, m0 + psubw m0, m4 + pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 + paddw m0, m1 + pmaddwd m0, m3 + psrlw m0, m5 + cmp hd, 64 + je .w64_end + mov r6d, 0x5556 + mov r2d, 0x3334 + test hd, 32 + cmovz r6d, r2d + movd m1, r6d + pmulhuw m0, m1 +.w64_end: + pxor m1, m1 + pshufb m0, m1 + mova m1, m0 + mova m2, m0 + mova m3, m0 +.s64: + mova [dstq], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + mova [dstq+strideq], m0 + mova [dstq+strideq+16], m1 + mova [dstq+strideq+32], m2 + mova [dstq+strideq+48], m3 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .s64 + RET + +;--------------------------------------------------------------------------------------- +;int dav1d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int a); +;--------------------------------------------------------------------------------------- +cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 + LEA r5, ipred_dc_left_ssse3_table + mov hd, hm ; zero upper half + tzcnt r6d, hd + sub tlq, hq + tzcnt wd, wm + movu m0, [tlq] + movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768] + movd m2, r6d + psrld m3, m2 + movsxd r6, [r5+r6*4] + pcmpeqd m2, m2 + pmaddubsw m0, m2 + add r6, r5 + add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 +.h64: + movu m1, [tlq+48] ; unaligned when jumping here from dc_top + pmaddubsw m1, m2 + paddw m0, m1 + movu m1, [tlq+32] ; unaligned when jumping here from dc_top + pmaddubsw m1, m2 + paddw m0, m1 +.h32: + movu m1, [tlq+16] ; unaligned when jumping here from dc_top + pmaddubsw m1, m2 + paddw m0, m1 +.h16: + pshufd m1, m0, q3232 ; psrlq m1, m0, 16 + paddw m0, m1 +.h8: + pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 + paddw m0, m1 +.h4: + pmaddwd m0, m2 + pmulhrsw m0, m3 + lea stride3q, [strideq*3] + pxor m1, m1 + pshufb m0, m1 + mova m1, m0 + mova m2, m0 + mova m3, m0 + jmp wq + +;--------------------------------------------------------------------------------------- +;int dav1d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int a); +;--------------------------------------------------------------------------------------- +cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 + LEA r5, ipred_dc_splat_ssse3_table + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r5+wq*4] + movddup m0, [r5-ipred_dc_splat_ssse3_table+pb_128] + mova m1, m0 + mova m2, m0 + mova m3, m0 + add wq, r5 + lea stride3q, [strideq*3] + jmp wq + +;--------------------------------------------------------------------------------------- +;int dav1d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int a); +;--------------------------------------------------------------------------------------- +cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h + LEA r5, ipred_dc_left_ssse3_table + tzcnt wd, wm + inc tlq + movu m0, [tlq] + movifnidn hd, hm + movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768] + movd m2, wd + psrld m3, m2 + movsxd r6, [r5+wq*4] + pcmpeqd m2, m2 + pmaddubsw m0, m2 + add r6, r5 + add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table + movsxd wq, [r5+wq*4] + add wq, r5 + jmp r6 + 
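+; The rectangular dc cases above avoid an integer divide: sum/(w+h)
+; is a shift by the power-of-two factor of w+h, then pmulhuw with a
+; 0.16 fixed-point reciprocal of the leftover 3 or 5 (0x5556 ~ 1/3,
+; 0x3334 ~ 1/5, selected via cmov). A scalar sketch of the same
+; rounding division (dc_round is an illustrative name, not a symbol
+; from this file); the reciprocals are exact for every sum these
+; block sizes can produce:
+;
+;   static unsigned dc_round(unsigned sum, unsigned w, unsigned h) {
+;       unsigned n  = w + h;               // always 2^k, 3*2^k or 5*2^k
+;       unsigned dc = (sum + n / 2) >> __builtin_ctz(n);
+;       if      (n % 3 == 0) dc = (dc * 0x5556) >> 16; // dc /= 3
+;       else if (n % 5 == 0) dc = (dc * 0x3334) >> 16; // dc /= 5
+;       return dc;
+;   }
+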
+;--------------------------------------------------------------------------------------- +;int dav1d_ipred_smooth_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int a); +;--------------------------------------------------------------------------------------- +%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2] + ; w * a = (w - 128) * a + 128 * a + ; (256 - w) * b = (127 - w) * b + 129 * b + ; => w * a + (256 - w) * b = [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b] + pmaddubsw m6, m%3, m%1 + pmaddubsw m0, m%4, m%2 ; (w - 128) * a + (127 - w) * b + paddw m6, m%5 + paddw m0, m%6 ; [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b + 128] + psrlw m6, 8 + psrlw m0, 8 + packuswb m6, m0 +%endmacro + +cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights +%define base r6-ipred_smooth_v_ssse3_table + LEA r6, ipred_smooth_v_ssse3_table + tzcnt wd, wm + mov hd, hm + movsxd wq, [r6+wq*4] + movddup m0, [base+pb_127_m127] + movddup m1, [base+pw_128] + lea weightsq, [base+smooth_weights+hq*4] + neg hq + movd m5, [tlq+hq] + pxor m2, m2 + pshufb m5, m2 + add wq, r6 + jmp wq +.w4: + movd m2, [tlq+1] + punpckldq m2, m2 + punpcklbw m2, m5 ; top, bottom + lea r3, [strideq*3] + mova m4, [base+ipred_v_shuf] + mova m5, m4 + punpckldq m4, m4 + punpckhdq m5, m5 + pmaddubsw m3, m2, m0 ; m3: 127 * top - 127 * bottom + paddw m1, m2 ; m1: 1 * top + 256 * bottom + 128, overflow is ok + paddw m3, m1 ; m3: 128 * top + 129 * bottom + 128 +.w4_loop: + movu m1, [weightsq+hq*2] + pshufb m0, m1, m4 ;m2, m3, m4 and m5 should be stable in loop + pshufb m1, m5 + SMOOTH 0, 1, 2, 2, 3, 3 + movd [dstq+strideq*0], m6 + pshuflw m1, m6, q1032 + movd [dstq+strideq*1], m1 + punpckhqdq m6, m6 + movd [dstq+strideq*2], m6 + psrlq m6, 32 + movd [dstq+r3 ], m6 + lea dstq, [dstq+strideq*4] + add hq, 4 + jl .w4_loop + RET +ALIGN function_align +.w8: + movq m2, [tlq+1] + punpcklbw m2, m5 + mova m5, [base+ipred_v_shuf] + lea r3, [strideq*3] + pshufd m4, m5, q0000 + pshufd m5, m5, q1111 + pmaddubsw m3, m2, m0 + paddw m1, m2 + paddw m3, m1 ; m3 is output for loop +.w8_loop: + movq m1, [weightsq+hq*2] + pshufb m0, m1, m4 + pshufb m1, m5 + SMOOTH 0, 1, 2, 2, 3, 3 + movq [dstq+strideq*0], m6 + movhps [dstq+strideq*1], m6 + lea dstq, [dstq+strideq*2] + add hq, 2 + jl .w8_loop + RET +ALIGN function_align +.w16: + movu m3, [tlq+1] + punpcklbw m2, m3, m5 + punpckhbw m3, m5 + pmaddubsw m4, m2, m0 + pmaddubsw m5, m3, m0 + paddw m0, m1, m2 + paddw m1, m3 + paddw m4, m0 + paddw m5, m1 ; m4 and m5 is output for loop +.w16_loop: + movd m1, [weightsq+hq*2] + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + SMOOTH 1, 1, 2, 3, 4, 5 + mova [dstq], m6 + add dstq, strideq + add hq, 1 + jl .w16_loop + RET +ALIGN function_align +.w32: +%if WIN64 + movaps [rsp+24], xmm7 + %define xmm_regs_used 8 +%endif + mova m7, m5 +.w32_loop_init: + mov r3d, 2 +.w32_loop: + movddup m0, [base+pb_127_m127] + movddup m1, [base+pw_128] + movu m3, [tlq+1] + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + pmaddubsw m4, m2, m0 + pmaddubsw m5, m3, m0 + paddw m0, m1, m2 + paddw m1, m3 + paddw m4, m0 + paddw m5, m1 + movd m1, [weightsq+hq*2] + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + SMOOTH 1, 1, 2, 3, 4, 5 + mova [dstq], m6 + add tlq, 16 + add dstq, 16 + dec r3d + jg .w32_loop + lea dstq, [dstq-32+strideq] + sub tlq, 32 + add hq, 1 + jl .w32_loop_init + RET +ALIGN function_align +.w64: +%if WIN64 + movaps [rsp+24], xmm7 + %define xmm_regs_used 8 +%endif + mova m7, m5 +.w64_loop_init: + mov r3d, 4 +.w64_loop: + 
movddup m0, [base+pb_127_m127] + movddup m1, [base+pw_128] + movu m3, [tlq+1] + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + pmaddubsw m4, m2, m0 + pmaddubsw m5, m3, m0 + paddw m0, m1, m2 + paddw m1, m3 + paddw m4, m0 + paddw m5, m1 + movd m1, [weightsq+hq*2] + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + SMOOTH 1, 1, 2, 3, 4, 5 + mova [dstq], m6 + add tlq, 16 + add dstq, 16 + dec r3d + jg .w64_loop + lea dstq, [dstq-64+strideq] + sub tlq, 64 + add hq, 1 + jl .w64_loop_init + RET + +;--------------------------------------------------------------------------------------- +;int dav1d_ipred_smooth_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int a); +;--------------------------------------------------------------------------------------- +cglobal ipred_smooth_h_8bpc, 3, 7, 8, dst, stride, tl, w, h +%define base r6-ipred_smooth_h_ssse3_table + LEA r6, ipred_smooth_h_ssse3_table + mov wd, wm + movd m3, [tlq+wq] + pxor m1, m1 + pshufb m3, m1 ; right + tzcnt wd, wd + mov hd, hm + movsxd wq, [r6+wq*4] + movddup m4, [base+pb_127_m127] + movddup m5, [base+pw_128] + add wq, r6 + jmp wq +.w4: + movddup m6, [base+smooth_weights+4*2] + mova m7, [base+ipred_h_shuf] + sub tlq, 4 + sub tlq, hq + lea r3, [strideq*3] +.w4_loop: + movd m2, [tlq+hq] ; left + pshufb m2, m7 + punpcklbw m1, m2, m3 ; left, right + punpckhbw m2, m3 + pmaddubsw m0, m1, m4 ; 127 * left - 127 * right + paddw m0, m1 ; 128 * left + 129 * right + pmaddubsw m1, m6 + paddw m1, m5 + paddw m0, m1 + pmaddubsw m1, m2, m4 + paddw m1, m2 + pmaddubsw m2, m6 + paddw m2, m5 + paddw m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + movd [dstq+strideq*0], m0 + pshuflw m1, m0, q1032 + movd [dstq+strideq*1], m1 + punpckhqdq m0, m0 + movd [dstq+strideq*2], m0 + psrlq m0, 32 + movd [dstq+r3 ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_loop + RET +ALIGN function_align +.w8: + mova m6, [base+smooth_weights+8*2] + mova m7, [base+ipred_h_shuf] + sub tlq, 4 + sub tlq, hq + punpckldq m7, m7 +.w8_loop: + movd m2, [tlq+hq] ; left + pshufb m2, m7 + punpcklbw m1, m2, m3 ; left, right + punpckhbw m2, m3 + pmaddubsw m0, m1, m4 ; 127 * left - 127 * right + paddw m0, m1 ; 128 * left + 129 * right + pmaddubsw m1, m6 + paddw m1, m5 + paddw m0, m1 + pmaddubsw m1, m2, m4 + paddw m1, m2 + pmaddubsw m2, m6 + paddw m2, m5 + paddw m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET +ALIGN function_align +.w16: + mova m6, [base+smooth_weights+16*2] + mova m7, [base+smooth_weights+16*3] + sub tlq, 1 + sub tlq, hq +.w16_loop: + pxor m1, m1 + movd m2, [tlq+hq] ; left + pshufb m2, m1 + punpcklbw m1, m2, m3 ; left, right + punpckhbw m2, m3 + pmaddubsw m0, m1, m4 ; 127 * left - 127 * right + paddw m0, m1 ; 128 * left + 129 * right + pmaddubsw m1, m6 + paddw m1, m5 + paddw m0, m1 + pmaddubsw m1, m2, m4 + paddw m1, m2 + pmaddubsw m2, m7 + paddw m2, m5 + paddw m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + mova [dstq], m0 + lea dstq, [dstq+strideq] + sub hd, 1 + jg .w16_loop + RET +ALIGN function_align +.w32: + sub tlq, 1 + sub tlq, hq + pxor m6, m6 +.w32_loop_init: + mov r5, 2 + lea r3, [base+smooth_weights+16*4] +.w32_loop: + mova m7, [r3] + add r3, 16 + movd m2, [tlq+hq] ; left + pshufb m2, m6 + punpcklbw m1, m2, m3 ; left, right + punpckhbw m2, m3 + pmaddubsw m0, m1, m4 ; 127 * left - 127 * right + paddw m0, m1 ; 128 * left + 129 * right + pmaddubsw m1, m7 + paddw m1, m5 + 
paddw m0, m1 + pmaddubsw m1, m2, m4 + paddw m1, m2 + mova m7, [r3] + add r3, 16 + pmaddubsw m2, m7 + paddw m2, m5 + paddw m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + mova [dstq], m0 + add dstq, 16 + dec r5 + jg .w32_loop + lea dstq, [dstq-32+strideq] + sub hd, 1 + jg .w32_loop_init + RET +ALIGN function_align +.w64: + sub tlq, 1 + sub tlq, hq + pxor m6, m6 +.w64_loop_init: + mov r5, 4 + lea r3, [base+smooth_weights+16*8] +.w64_loop: + mova m7, [r3] + add r3, 16 + movd m2, [tlq+hq] ; left + pshufb m2, m6 + punpcklbw m1, m2, m3 ; left, right + punpckhbw m2, m3 + pmaddubsw m0, m1, m4 ; 127 * left - 127 * right + paddw m0, m1 ; 128 * left + 129 * right + pmaddubsw m1, m7 + paddw m1, m5 + paddw m0, m1 + pmaddubsw m1, m2, m4 + paddw m1, m2 + mova m7, [r3] + add r3, 16 + pmaddubsw m2, m7 + paddw m2, m5 + paddw m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + mova [dstq], m0 + add dstq, 16 + dec r5 + jg .w64_loop + lea dstq, [dstq-64+strideq] + sub hd, 1 + jg .w64_loop_init + RET + +;--------------------------------------------------------------------------------------- +;int dav1d_ipred_smooth_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int a); +;--------------------------------------------------------------------------------------- +%macro SMOOTH_2D_END 7 ; src[1-2], mul[1-2], add[1-2], m3 + pmaddubsw m6, m%3, m%1 + mova m0, m6 + pmaddubsw m6, m%4, m%2 + mova m1, m6 +%ifnum %5 + paddw m0, m%5 +%else + paddw m0, %5 +%endif +%ifnum %6 + paddw m1, m%6 +%else + paddw m1, %6 +%endif +%ifnum %7 +%else + mova m3, %7 +%endif + pavgw m0, m2 + pavgw m1, m3 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 +%endmacro + +%macro SMOOTH_OUTPUT_16B 12 ; m1, [buffer1, buffer2, buffer3, buffer4,] [w1, w2,] m3, m7, [m0, m4, m5] + mova m1, [rsp+16*%1] ; top + punpckhbw m6, m1, m0 ; top, bottom + punpcklbw m1, m0 ; top, bottom + pmaddubsw m2, m1, m5 + mova [rsp+16*%2], m1 + paddw m1, m3 ; 1 * top + 255 * bottom + 255 + paddw m2, m1 ; 128 * top + 129 * bottom + 255 + mova [rsp+16*%3], m2 + pmaddubsw m2, m6, m5 + mova [rsp+16*%4], m6 + paddw m6, m3 ; 1 * top + 255 * bottom + 255 + paddw m2, m6 ; 128 * top + 129 * bottom + 255 + mova [rsp+16*%5], m2 + movd m1, [tlq+hq] ; left + pshufb m1, [base+pb_3] ; topleft[-(1 + y)] + punpcklbw m1, m4 ; left, right + pmaddubsw m2, m1, m5 ; 127 * left - 127 * right + paddw m2, m1 ; 128 * left + 129 * right + mova m3, m2 + pmaddubsw m0, m1, %6 ; weights_hor = &dav1d_sm_weights[width]; + pmaddubsw m1, %7 + paddw m2, m3, m0 + paddw m3, m1 + movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; + mova m7, [rsp+16*%9] + pshufb m1, m7 + mova [rsp+16*%8], m3 + mova m4, [rsp+16*%2] + mova m5, [rsp+16*%3] + mova m3, [rsp+16*%4] + mova m7, [rsp+16*%5] + SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*%8] + mova [dstq], m0 + movddup m3, [base+pw_255] ; recovery + mova m0, [rsp+16*%10] ; recovery + mova m4, [rsp+16*%11] ; recovery + mova m5, [rsp+16*%12] ; recovery +%endmacro + +cglobal ipred_smooth_8bpc, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights +%define base r6-ipred_smooth_ssse3_table + mov wd, wm + mov hd, hm + LEA r6, ipred_smooth_ssse3_table + movd m4, [tlq+wq] ; right + pxor m2, m2 + pshufb m4, m2 + tzcnt wd, wd + mov r5, tlq + sub r5, hq + movsxd wq, [r6+wq*4] + movddup m5, [base+pb_127_m127] + movd m0, [r5] + pshufb m0, m2 ; bottom + movddup m3, [base+pw_255] + add wq, r6 + lea v_weightsq, [base+smooth_weights+hq*2] ; weights_ver = &dav1d_sm_weights[height] + jmp wq +.w4: + mova m7, 
[base+ipred_v_shuf] + movd m1, [tlq+1] ; left + pshufd m1, m1, q0000 + sub tlq, 4 + lea r3, [strideq*3] + sub tlq, hq + punpcklbw m1, m0 ; top, bottom + pshufd m6, m7, q1100 + pshufd m7, m7, q3322 + pmaddubsw m2, m1, m5 + paddw m3, m1 ; 1 * top + 255 * bottom + 255 + paddw m2, m3 ; 128 * top + 129 * bottom + 255 + mova [rsp+16*0], m1 + mova [rsp+16*1], m2 + movq m1, [base+smooth_weights+4*2] ; weights_hor = &dav1d_sm_weights[width]; + punpcklqdq m1, m1 + mova [rsp+16*2], m1 + mova [rsp+16*3], m4 + mova [rsp+16*4], m6 + mova [rsp+16*5], m5 +.w4_loop: + movd m1, [tlq+hq] ; left + pshufb m1, [base+ipred_h_shuf] + punpcklbw m0, m1, m4 ; left, right + punpckhbw m1, m4 + pmaddubsw m2, m0, m5 ; 127 * left - 127 * right + pmaddubsw m3, m1, m5 + paddw m2, m0 ; 128 * left + 129 * right + paddw m3, m1 + mova m4, [rsp+16*2] + pmaddubsw m0, m4 + pmaddubsw m1, m4 + paddw m2, m0 + paddw m3, m1 + movq m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; + add v_weightsq, 8 + pshufb m0, m1, m6 + pshufb m1, m7 + mova m4, [rsp+16*0] + mova m5, [rsp+16*1] + SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3 + mova m4, [rsp+16*3] + mova m6, [rsp+16*4] + mova m5, [rsp+16*5] + movd [dstq+strideq*0], m0 + pshuflw m1, m0, q1032 + movd [dstq+strideq*1], m1 + punpckhqdq m0, m0 + movd [dstq+strideq*2], m0 + psrlq m0, 32 + movd [dstq+r3 ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_loop + RET +ALIGN function_align +.w8: + mova m7, [base+ipred_v_shuf] + movq m1, [tlq+1] ; left + punpcklqdq m1, m1 + sub tlq, 4 + sub tlq, hq + punpcklbw m1, m0 + pshufd m6, m7, q0000 + pshufd m7, m7, q1111 + pmaddubsw m2, m1, m5 + paddw m3, m1 + paddw m2, m3 + mova [rsp+16*0], m1 + mova [rsp+16*1], m2 + mova m1, [base+smooth_weights+8*2] ; weights_hor = &dav1d_sm_weights[width]; + mova [rsp+16*2], m1 + mova [rsp+16*3], m4 + mova [rsp+16*4], m6 + mova [rsp+16*5], m5 +.w8_loop: + movd m1, [tlq+hq] ; left + pshufb m1, [base+ipred_h_shuf] + pshufd m1, m1, q1100 + punpcklbw m0, m1, m4 + punpckhbw m1, m4 + pmaddubsw m2, m0, m5 + pmaddubsw m3, m1, m5 + paddw m2, m0 + paddw m3, m1 + mova m4, [rsp+16*2] + pmaddubsw m0, m4 + pmaddubsw m1, m4 + paddw m2, m0 + paddw m3, m1 + movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; + add v_weightsq, 4 + pshufb m0, m1, m6 + pshufb m1, m7 + mova m4, [rsp+16*0] + mova m5, [rsp+16*1] + SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3 + mova m4, [rsp+16*3] + mova m6, [rsp+16*4] + mova m5, [rsp+16*5] + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET +ALIGN function_align +.w16: + mova m7, [base+ipred_v_shuf] + movu m1, [tlq+1] ; left + sub tlq, 4 + sub tlq, hq + punpckhbw m6, m1, m0 ; top, bottom + punpcklbw m1, m0 ; top, bottom + pshufd m7, m7, q0000 + mova [rsp+16*2], m7 + pmaddubsw m2, m6, m5 + mova [rsp+16*5], m6 + paddw m6, m3 ; 1 * top + 255 * bottom + 255 + paddw m2, m6 ; 128 * top + 129 * bottom + 255 + mova [rsp+16*6], m2 + pmaddubsw m2, m1, m5 + paddw m3, m1 ; 1 * top + 255 * bottom + 255 + mova [rsp+16*0], m1 + paddw m2, m3 ; 128 * top + 129 * bottom + 255 + mova [rsp+16*1], m2 + mova [rsp+16*3], m4 + mova [rsp+16*4], m5 +.w16_loop: + movd m1, [tlq+hq] ; left + pshufb m1, [base+pb_3] ; topleft[-(1 + y)] + punpcklbw m1, m4 ; left, right + pmaddubsw m2, m1, m5 ; 127 * left - 127 * right + paddw m2, m1 ; 128 * left + 129 * right + mova m0, m1 + mova m3, m2 + pmaddubsw m0, [base+smooth_weights+16*2] ; weights_hor = &dav1d_sm_weights[width]; + pmaddubsw m1, [base+smooth_weights+16*3] + paddw m2, m0 + paddw m3, m1 + movd m1, [v_weightsq] ; 
weights_ver = &dav1d_sm_weights[height]; + add v_weightsq, 2 + mova m7, [rsp+16*2] + pshufb m1, m7 + mova [rsp+16*7], m3 + mova m4, [rsp+16*0] + mova m5, [rsp+16*1] + mova m3, [rsp+16*5] + mova m7, [rsp+16*6] + SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*7] + mova m4, [rsp+16*3] + mova m5, [rsp+16*4] + mova [dstq], m0 + lea dstq, [dstq+strideq] + sub hd, 1 + jg .w16_loop + RET +ALIGN function_align +.w32: + movu m1, [tlq+1] ; top topleft[1 + x] + movu m2, [tlq+17] ; top + mova [rsp+16*0], m1 + mova [rsp+16*1], m2 + sub tlq, 4 + sub tlq, hq + mova m7, [base+ipred_v_shuf] + pshufd m7, m7, q0000 + mova [rsp+16*2], m7 + mova [rsp+16*3], m0 + mova [rsp+16*4], m4 + mova [rsp+16*5], m5 +.w32_loop: + SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*4], [base+smooth_weights+16*5], 10, 2, 3, 4, 5 + add dstq, 16 + SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*6], [base+smooth_weights+16*7], 10, 2, 3, 4, 5 + lea dstq, [dstq-16+strideq] + add v_weightsq, 2 + sub hd, 1 + jg .w32_loop + RET +ALIGN function_align +.w64: + movu m1, [tlq+1] ; top topleft[1 + x] + movu m2, [tlq+17] ; top + mova [rsp+16*0], m1 + mova [rsp+16*1], m2 + movu m1, [tlq+33] ; top + movu m2, [tlq+49] ; top + mova [rsp+16*11], m1 + mova [rsp+16*12], m2 + sub tlq, 4 + sub tlq, hq + mova m7, [base+ipred_v_shuf] + pshufd m7, m7, q0000 + mova [rsp+16*2], m7 + mova [rsp+16*3], m0 + mova [rsp+16*4], m4 + mova [rsp+16*5], m5 +.w64_loop: + SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*8], [base+smooth_weights+16*9], 10, 2, 3, 4, 5 + add dstq, 16 + SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*10], [base+smooth_weights+16*11], 10, 2, 3, 4, 5 + add dstq, 16 + SMOOTH_OUTPUT_16B 11, 6, 7, 8, 9, [base+smooth_weights+16*12], [base+smooth_weights+16*13], 10, 2, 3, 4, 5 + add dstq, 16 + SMOOTH_OUTPUT_16B 12, 6, 7, 8, 9, [base+smooth_weights+16*14], [base+smooth_weights+16*15], 10, 2, 3, 4, 5 + lea dstq, [dstq-48+strideq] + add v_weightsq, 2 + sub hd, 1 + jg .w64_loop + RET + +%if ARCH_X86_64 +cglobal ipred_z1_8bpc, 3, 8, 11, 16*12, dst, stride, tl, w, h, angle, dx + %define base r7-$$ + lea r7, [$$] + mova m8, [base+pw_62] + mova m9, [base+pw_64] + mova m10, [base+pw_512] +%else +cglobal ipred_z1_8bpc, 3, 7, 8, -16*13, dst, _, tl, w, h, angle, dx + %define base r1-$$ + %define m8 [base+pw_62] + %define m9 [base+pw_64] + %define m10 [base+pw_512] + %define strideq r3 + %define stridemp dword [rsp+16*12] + mov stridemp, r1 + LEA r1, $$ +%endif + tzcnt wd, wm + movifnidn angled, anglem + movifnidn hd, hm + inc tlq + movsxd wq, [base+ipred_z1_ssse3_table+wq*4] + mov dxd, angled + and dxd, 0x7e + add angled, 165 ; ~90 + lea wq, [base+wq+ipred_z1_ssse3_table] + movzx dxd, word [base+dr_intra_derivative+dxq] + xor angled, 0x4ff ; d = 90 - angle + jmp wq +.w4: + lea r3d, [angleq+88] + test r3d, 0x480 + jnz .w4_no_upsample ; !enable_intra_edge_filter || angle >= 40 + sar r3d, 9 + add r3d, hd + cmp r3d, 8 + jg .w4_no_upsample ; h > 8 || (w == h && is_sm) + mova m1, [tlq-1] + pshufb m0, m1, [base+z_upsample1] + pshufb m1, [base+z_upsample2] + movddup m2, [base+pb_36_m4] + add dxd, dxd + pmaddubsw m0, m2 + pshufd m7, m1, q3333 + movd [rsp+16], m7 ; top[max_base_x] + pmaddubsw m1, m2 + movd m6, dxd + mov r5d, dxd ; xpos + pshufb m6, [base+pw_256] + paddw m1, m0 + movq m0, [tlq] + pmulhrsw m1, m10 + paddw m7, m6, m6 + punpcklqdq m6, m7 ; xpos0 xpos1 + packuswb m1, m1 + punpcklbw m0, m1 + movifnidn strideq, stridemp + mova [rsp], m0 +.w4_upsample_loop: + lea r2d, [r5+dxq] + shr r5d, 6 ; base0 + movq m0, [rsp+r5] + lea r5d, 
[r2+dxq] + shr r2d, 6 ; base1 + movhps m0, [rsp+r2] + pand m2, m8, m6 ; frac + psubw m1, m9, m2 ; 64-frac + psllw m2, 8 + por m1, m2 ; 64-frac, frac + pmaddubsw m0, m1 + paddw m6, m7 ; xpos += dx + pmulhrsw m0, m10 + packuswb m0, m0 + movd [dstq+strideq*0], m0 + pshuflw m0, m0, q1032 + movd [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w4_upsample_loop + RET +.w4_no_upsample: + mov r3d, 7 ; max_base + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w4_main + lea r3d, [hq+3] + movd m0, r3d + movd m2, angled + shr angled, 8 ; is_sm << 1 + pxor m1, m1 + pshufb m0, m1 + pshufb m2, m1 + pcmpeqb m1, m0, [base+z_filter_wh4] + pand m1, m2 + pcmpgtb m1, [base+z_filter_t_w48+angleq*8] + pmovmskb r5d, m1 + mov r3d, 7 + test r5d, r5d + jz .w4_main ; filter_strength == 0 + mova m3, [tlq-1] + imul r5d, 0x55555555 + movu m7, [base+z_filter_s+8] + shr r5d, 30 ; filter_strength + movddup m0, [base+pb_8] + pminub m7, m0 + pshufb m0, m3, [base+z_filter_s] + movddup m4, [base+z_filter_k-8+r5*8+24*0] + pshufb m3, m7 + movddup m5, [base+z_filter_k-8+r5*8+24*1] + shufps m2, m0, m3, q2121 + movddup m6, [base+z_filter_k-8+r5*8+24*2] + pmaddubsw m0, m4 + pmaddubsw m1, m2, m4 + pmaddubsw m2, m5 + paddd m5, m6 + pmaddubsw m4, m3, m5 + pmaddubsw m3, m6 + paddw m0, m2 + paddw m1, m4 + paddw m0, m3 + pshufd m1, m1, q3333 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + mov r5d, 9 + mov tlq, rsp + cmp hd, 4 + cmovne r3d, r5d + packuswb m0, m1 + mova [tlq], m0 +.w4_main: + add tlq, r3 + movd m5, dxd + movddup m0, [base+z_base_inc] ; base_inc << 6 + movd m7, [tlq] ; top[max_base_x] + shl r3d, 6 + movd m4, r3d + pshufb m5, [base+pw_256] + mov r5d, dxd ; xpos + pshufb m7, [base+pw_m256] + sub r5, r3 + pshufb m4, [base+pw_256] + mova m3, [base+z1_shuf_w4] + paddw m6, m5, m5 + psubw m4, m0 ; max_base_x + punpcklqdq m5, m6 ; xpos0 xpos1 +.w4_loop: + lea r3, [r5+dxq] + sar r5, 6 ; base0 + movq m0, [tlq+r5] + lea r5, [r3+dxq] + sar r3, 6 ; base1 + movhps m0, [tlq+r3] + pand m2, m8, m5 ; frac + psubw m1, m9, m2 ; 64-frac + psllw m2, 8 + pshufb m0, m3 + por m1, m2 ; 64-frac, frac + pmaddubsw m0, m1 + movifnidn strideq, stridemp + pcmpgtw m1, m4, m5 ; base < max_base_x + pmulhrsw m0, m10 + paddw m5, m6 ; xpos += dx + pand m0, m1 + pandn m1, m7 + por m0, m1 + packuswb m0, m0 + movd [dstq+strideq*0], m0 + pshuflw m0, m0, q1032 + movd [dstq+strideq*1], m0 + sub hd, 2 + jz .w4_end + lea dstq, [dstq+strideq*2] + test r5d, r5d + jl .w4_loop + packuswb m7, m7 +.w4_end_loop: + movd [dstq+strideq*0], m7 + movd [dstq+strideq*1], m7 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w4_end_loop +.w4_end: + RET +.w8: + lea r3d, [angleq+88] + and r3d, ~0x7f + or r3d, hd + cmp r3d, 8 + ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 + mova m5, [base+z_upsample1] + movu m3, [base+z_filter_s+6] + movd m4, hd + mova m0, [tlq-1] + movu m1, [tlq+7] + pxor m7, m7 + pshufb m4, m7 + movddup m7, [base+pb_36_m4] + pminub m4, m3 + add dxd, dxd + pshufb m2, m0, m5 + pmaddubsw m2, m7 + pshufb m0, m3 + pmaddubsw m0, m7 + movd m6, dxd + pshufb m3, m1, m5 + pmaddubsw m3, m7 + pshufb m1, m4 + pmaddubsw m1, m7 + pshufb m6, [base+pw_256] + mov r5d, dxd + paddw m2, m0 + paddw m7, m6, m6 + paddw m3, m1 + punpcklqdq m6, m7 ; xpos0 xpos1 + movu m1, [tlq] + pmulhrsw m2, m10 + pmulhrsw m3, m10 + packuswb m2, m3 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + movifnidn strideq, stridemp + mova [rsp+16*0], m0 + mova [rsp+16*1], m1 +.w8_upsample_loop: + lea r2d, [r5+dxq] + shr r5d, 6 ; base0 + movu m0, [rsp+r5] + lea r5d, [r2+dxq] + 
shr r2d, 6 ; base1 + movu m1, [rsp+r2] + pand m2, m8, m6 + psubw m3, m9, m2 + psllw m2, 8 + por m3, m2 + punpcklqdq m2, m3, m3 ; frac0 + pmaddubsw m0, m2 + punpckhqdq m3, m3 ; frac1 + pmaddubsw m1, m3 + paddw m6, m7 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + packuswb m0, m1 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_upsample_loop + RET +.w8_no_upsample: + lea r3d, [hq+7] + movd m0, r3d + and r3d, 7 + or r3d, 8 ; imin(h+7, 15) + test angled, 0x400 + jnz .w8_main + movd m2, angled + shr angled, 8 ; is_sm << 1 + pxor m1, m1 + pshufb m0, m1 + pshufb m2, m1 + movu m1, [base+z_filter_wh8] + psrldq m3, [base+z_filter_t_w48+angleq*8], 4 + pcmpeqb m1, m0 + pand m1, m2 + pcmpgtb m1, m3 + pmovmskb r5d, m1 + test r5d, r5d + jz .w8_main ; filter_strength == 0 + movd m3, [tlq-1] + movu m0, [tlq+16*0] + imul r5d, 0x55555555 + movu m1, [tlq+16*1] + shr r5d, 30 ; filter_strength + movd m2, [tlq+r3] + lea tlq, [rsp+16*4] + sub r5, 3 + mova [tlq-16*1], m0 + pxor m7, m7 + mova [tlq+16*0], m1 + pshufb m3, m7 + pshufb m2, m7 + mova [tlq-16*2], m3 + movq [tlq+r3-15], m2 + call .filter_edge + sar r5d, 1 + add r5d, 17 + cmp hd, 8 + cmova r3d, r5d +.w8_main: + add tlq, r3 + movd m5, dxd + movd m7, [tlq] + shl r3d, 6 + movu m3, [base+z_filter_s+2] + movd m4, r3d + pshufb m5, [base+pw_256] + mov r5d, dxd + pshufb m7, [base+pw_m256] + sub r5, r3 + pshufb m4, [base+pw_256] + psubw m4, [base+z_base_inc] + mova m6, m5 +.w8_loop: + mov r3, r5 + sar r3, 6 + movu m0, [tlq+r3] + pand m1, m8, m5 + psubw m2, m9, m1 + psllw m1, 8 + pshufb m0, m3 + por m1, m2 + pmaddubsw m0, m1 + pcmpgtw m1, m4, m5 + paddw m5, m6 + pmulhrsw m0, m10 + pand m0, m1 + pandn m1, m7 + por m0, m1 + packuswb m0, m0 + movq [dstq], m0 + dec hd + jz .w8_end + movifnidn strideq, stridemp + add dstq, strideq + add r5, dxq + jl .w8_loop + packuswb m7, m7 +.w8_end_loop: + movq [dstq], m7 + add dstq, strideq + dec hd + jg .w8_end_loop +.w8_end: + RET +.w16: + lea r3d, [hq+15] + movd m0, r3d + and r3d, 15 + or r3d, 16 ; imin(h+15, 31) + test angled, 0x400 + jnz .w16_main + movd m2, angled + shr angled, 8 ; is_sm << 1 + pxor m1, m1 + pshufb m0, m1 + pshufb m2, m1 + movq m3, [base+z_filter_t_w16+angleq*4] + pcmpeqb m1, m0, [base+z_filter_wh16] + pand m1, m2 + pcmpgtb m1, m3 + pmovmskb r5d, m1 + test r5d, r5d + jz .w16_main ; filter_strength == 0 + movd m4, [tlq-1] + movu m0, [tlq+16*0] + imul r5d, 0x24924924 + movu m1, [tlq+16*1] + shr r5d, 30 + movd m2, [tlq+30] + adc r5, -4 ; filter_strength-3 + movd m3, [tlq+r3] + lea tlq, [rsp+16*4] + mova [tlq-16*1], m0 + pxor m7, m7 + mova [tlq+16*0], m1 + pshufb m4, m7 + movd [rsp], m2 + pshufb m3, m7 + mova [tlq-16*2], m4 + movd [tlq+r3-16], m3 + call .filter_edge + cmp hd, 16 + jle .w16_main + pshuflw m0, [rsp], q0000 + sar r5, 1 + movd m1, [base+z_filter_k_tail+4+r5*4] + lea r3d, [r5+33] + pmaddubsw m0, m1 +%if ARCH_X86_64 + pmulhrsw m0, m10 +%else + pmulhrsw m0, m4 +%endif + packuswb m0, m0 + movd [tlq+32], m0 +.w16_main: + add tlq, r3 + movd m5, dxd + movd m7, [tlq] + movd m4, r3d + shl r3d, 6 + pshufb m5, [base+pw_256] + pxor m6, m6 + pshufb m7, m6 + mov r5d, dxd + pshufb m4, m6 + sub r5, r3 + psubb m4, [base+pb_0to15] + mova m6, m5 +.w16_loop: + mov r3, r5 + sar r3, 6 + movu m1, [tlq+r3+0] + pand m0, m8, m5 + movu m2, [tlq+r3+1] + psubw m3, m9, m0 + psllw m0, 8 + por m3, m0 + punpcklbw m0, m1, m2 + pmaddubsw m0, m3 + punpckhbw m1, m2 + pmaddubsw m1, m3 + psrlw m3, m5, 6 + packsswb m3, m3 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + paddw m5, m6 + pcmpgtb m2, 
m4, m3 + packuswb m0, m1 + pand m0, m2 + pandn m2, m7 + por m0, m2 + mova [dstq], m0 + dec hd + jz .w16_end + movifnidn strideq, stridemp + add dstq, strideq + add r5, dxq + jl .w16_loop +.w16_end_loop: + mova [dstq], m7 + add dstq, strideq + dec hd + jg .w16_end_loop +.w16_end: + RET +.w32: + lea r3d, [hq+31] + and r3d, 31 + or r3d, 32 ; imin(h+31, 63) + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w32_main + movd m6, [tlq-1] + movu m0, [tlq+16*0] + movu m1, [tlq+16*1] + movu m2, [tlq+16*2] + movu m3, [tlq+16*3] + movd m4, [tlq+62] + movd m5, [tlq+r3] + lea tlq, [rsp+16*6] + mova [tlq-16*3], m0 + pxor m7, m7 + mova [tlq-16*2], m1 + pshufb m6, m7 + mova [tlq-16*1], m2 + xor r5d, r5d ; filter_strength = 3 + mova [tlq+16*0], m3 + movd [rsp], m4 + pshufb m5, m7 + mova [tlq-16*4], m6 + movd [tlq+r3-48], m5 + call .filter_edge + sub tlq, 16*2 + call .filter_edge + cmp hd, 32 + jle .w32_main + pshuflw m0, [rsp], q0000 + movd m1, [base+z_filter_k_tail+4] + add r3d, 2 + pmaddubsw m0, m1 +%if ARCH_X86_64 + pmulhrsw m0, m10 +%else + pmulhrsw m0, m4 +%endif + packuswb m0, m0 + movd [tlq+64], m0 +.w32_main: + add tlq, r3 + movd m0, r3d + movd m7, [tlq] + shl r3d, 6 + movd m5, dxd + pxor m6, m6 + mov r5d, dxd + pshufb m0, m6 + pshufb m5, [base+pw_256] + sub r5, r3 + pshufb m7, m6 + psubb m0, [base+pb_0to15] + movddup m1, [base+pb_m16] + mova [rsp+16*0], m0 + paddb m0, m1 + mova [rsp+16*1], m0 + mova m6, m5 +.w32_loop: + mov r3, r5 + sar r3, 6 + movu m1, [tlq+r3+16*0+0] + pand m0, m8, m5 + movu m2, [tlq+r3+16*0+1] + psubw m3, m9, m0 + psllw m0, 8 + por m3, m0 + punpcklbw m0, m1, m2 + pmaddubsw m0, m3 + punpckhbw m1, m2 + pmaddubsw m1, m3 + psrlw m4, m5, 6 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + packsswb m4, m4 + pcmpgtb m2, [rsp+16*0], m4 + packuswb m0, m1 + pand m0, m2 + pandn m2, m7 + por m0, m2 + movu m1, [tlq+r3+16*1+0] + movu m2, [tlq+r3+16*1+1] + mova [dstq+16*0], m0 + punpcklbw m0, m1, m2 + pmaddubsw m0, m3 + punpckhbw m1, m2 + pmaddubsw m1, m3 + paddw m5, m6 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + pcmpgtb m2, [rsp+16*1], m4 + packuswb m0, m1 + pand m0, m2 + pandn m2, m7 + por m0, m2 + mova [dstq+16*1], m0 + dec hd + jz .w32_end + movifnidn strideq, stridemp + add dstq, strideq + add r5, dxq + jl .w32_loop +.w32_end_loop: + mova [dstq+16*0], m7 + mova [dstq+16*1], m7 + add dstq, strideq + dec hd + jg .w32_end_loop +.w32_end: + RET +.w64: + lea r3d, [hq+63] + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w64_main + movd m4, [tlq-1] + movu m0, [tlq+16*0] + movu m1, [tlq+16*1] + movu m2, [tlq+16*2] + movu m3, [tlq+16*3] + mova [rsp+16*3], m0 + pxor m7, m7 + mova [rsp+16*4], m1 + pshufb m4, m7 + mova [rsp+16*5], m2 + mova [rsp+16*6], m3 + mova [rsp+16*2], m4 + movu m0, [tlq+16*4] + movu m1, [tlq+16*5] + movu m2, [tlq+16*6] + movu m3, [tlq+16*7] + movd m4, [tlq+r3] + lea tlq, [rsp+16*10] + mova [tlq-16*3], m0 + xor r5d, r5d ; filter_strength = 3 + mova [tlq-16*2], m1 + pshufb m4, m7 + mova [tlq-16*1], m2 + mova [tlq+16*0], m3 + movd [tlq+r3-16*7], m4 + cmp hd, 64 + jl .w64_filter96 ; skip one call if the last 32 bytes aren't used + call .filter_edge +.w64_filter96: + sub tlq, 16*2 + call .filter_edge + sub tlq, 16*2 + call .filter_edge + sub tlq, 16*2 + call .filter_edge +.w64_main: + add tlq, r3 + movd m0, r3d + movd m7, [tlq] + shl r3d, 6 + movd m5, dxd + pxor m6, m6 + mov r5d, dxd + pshufb m0, m6 + sub r5, r3 + pshufb m5, [base+pw_256] + pshufb m7, m6 + psubb m0, [base+pb_0to15] + movddup m1, [base+pb_m16] + mova [rsp+16*0], m0 + paddb m0, m1 + mova [rsp+16*1], m0 + paddb m0, m1 + 
mova [rsp+16*2], m0 + paddb m0, m1 + mova [rsp+16*3], m0 + mova m6, m5 +.w64_loop: + mov r3, r5 + sar r3, 6 + movu m1, [tlq+r3+16*0+0] + pand m0, m8, m5 + movu m2, [tlq+r3+16*0+1] + psubw m3, m9, m0 + psllw m0, 8 + por m3, m0 + punpcklbw m0, m1, m2 + pmaddubsw m0, m3 + punpckhbw m1, m2 + pmaddubsw m1, m3 + psrlw m4, m5, 6 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + packsswb m4, m4 + pcmpgtb m2, [rsp+16*0], m4 + packuswb m0, m1 + pand m0, m2 + pandn m2, m7 + por m0, m2 + movu m1, [tlq+r3+16*1+0] + movu m2, [tlq+r3+16*1+1] + mova [dstq+16*0], m0 + punpcklbw m0, m1, m2 + pmaddubsw m0, m3 + punpckhbw m1, m2 + pmaddubsw m1, m3 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + pcmpgtb m2, [rsp+16*1], m4 + packuswb m0, m1 + pand m0, m2 + pandn m2, m7 + por m0, m2 + movu m1, [tlq+r3+16*2+0] + movu m2, [tlq+r3+16*2+1] + mova [dstq+16*1], m0 + punpcklbw m0, m1, m2 + pmaddubsw m0, m3 + punpckhbw m1, m2 + pmaddubsw m1, m3 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + pcmpgtb m2, [rsp+16*2], m4 + packuswb m0, m1 + pand m0, m2 + pandn m2, m7 + por m0, m2 + movu m1, [tlq+r3+16*3+0] + movu m2, [tlq+r3+16*3+1] + mova [dstq+16*2], m0 + punpcklbw m0, m1, m2 + pmaddubsw m0, m3 + punpckhbw m1, m2 + pmaddubsw m1, m3 + paddw m5, m6 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + pcmpgtb m2, [rsp+16*3], m4 + packuswb m0, m1 + pand m0, m2 + pandn m2, m7 + por m0, m2 + mova [dstq+16*3], m0 + dec hd + jz .w64_end + movifnidn strideq, stridemp + add dstq, strideq + add r5, dxq + jl .w64_loop +.w64_end_loop: + mova [dstq+16*0], m7 + mova [dstq+16*1], m7 + mova [dstq+16*2], m7 + mova [dstq+16*3], m7 + add dstq, strideq + dec hd + jg .w64_end_loop +.w64_end: + RET +ALIGN function_align +.filter_edge: ; 32 pixels/iteration + movddup m7, [base+z_filter_k+8*2+r5*8+24*0] + movu m2, [tlq-18] + movu m1, [tlq-17] + movu m3, [tlq- 2] + movu m4, [tlq- 1] + punpcklbw m0, m2, m1 + pmaddubsw m0, m7 + punpckhbw m2, m1 + pmaddubsw m2, m7 + punpcklbw m1, m3, m4 + pmaddubsw m1, m7 + punpckhbw m3, m4 + pmaddubsw m3, m7 + movddup m7, [base+z_filter_k+8*2+r5*8+24*1] + mova m5, [tlq-16] + movu m6, [tlq-15] + punpcklbw m4, m5, m6 + pmaddubsw m4, m7 + punpckhbw m5, m6 + pmaddubsw m5, m7 + paddw m0, m4 + paddw m2, m5 + mova m5, [tlq+ 0] + movu m6, [tlq+ 1] + punpcklbw m4, m5, m6 + pmaddubsw m4, m7 + punpckhbw m5, m6 + pmaddubsw m5, m7 + paddw m1, m4 + paddw m3, m5 + test r5d, r5d + jnz .filter_end ; 3-tap + movddup m7, [base+z_filter_k+8*8] + movu m5, [tlq-14] + movu m6, [tlq+ 2] + punpcklbw m4, m5, m5 + pmaddubsw m4, m7 + punpckhbw m5, m5 + pmaddubsw m5, m7 + paddw m0, m4 + paddw m2, m5 + punpcklbw m5, m6, m6 + pmaddubsw m5, m7 + punpckhbw m6, m6 + pmaddubsw m6, m7 + paddw m1, m5 + paddw m3, m6 +.filter_end: +%if ARCH_X86_64 + REPX {pmulhrsw x, m10}, m0, m2, m1, m3 +%else + mova m4, m10 + REPX {pmulhrsw x, m4 }, m0, m2, m1, m3 +%endif + packuswb m0, m2 + packuswb m1, m3 + mova [tlq+16*0], m0 + mova [tlq+16*1], m1 + ret + +%if ARCH_X86_64 +cglobal ipred_z2_8bpc, 4, 12, 13, 16*16, dst, stride, tl, w, h, angle, dx, _, dy + %define base r7-$$ + %define maxwm r6m + %define maxhm r7m + lea r7, [$$] + mov hd, hm + mova m8, [base+pw_62] + mova m9, [base+pw_64] + lea r9d, [wq-4] + mova m10, [base+pw_512] + shl r9d, 6 + mova m11, [base+z1_shuf_w4] + or r9d, hd + mova m12, [base+z2_h_shuf] +%else +cglobal ipred_z2_8bpc, 4, 7, 8, -16*20, dst, _, tl, w, h, angle, dx + %define base r1-$$ + %define m8 [base+pw_62] + %define m9 [base+pw_64] + %define m10 [base+pw_512] + %define m11 [rsp+16*16] + %define m12 [rsp+16*17] + %define r8 [rsp+16*6+4*1] + %define r9b byte [rsp+16*18+4*0] + 
%define r9d dword [rsp+16*18+4*0] + %define r10d dword [rsp+16*18+4*1] + %define r11d dword [rsp+16*18+4*2] + %define maxwm [rsp+16*18+4*3] + %define maxhm [rsp+16*19+4*0] + %define stridemp [rsp+16*19+4*1] + %define strideq r3 + %define dyd r4 + %define dyq r4 + mov stridemp, r1 + mov r1d, r6m + mov r4d, r7m + mov maxwm, r1d + mov maxhm, r4d + LEA r1, $$ + lea hd, [wq-4] + mova m0, [base+z1_shuf_w4] + shl hd, 6 + mova m1, [base+z2_h_shuf] + or hd, hm + mova m11, m0 + mov r9d, hd + mova m12, m1 +%endif + tzcnt wd, wd + movifnidn angled, anglem + movsxd wq, [base+ipred_z2_ssse3_table+wq*4] +%if ARCH_X86_64 + movzx dxd, angleb +%else + movzx dxd, byte anglem +%endif + xor angled, 0x400 + mova m0, [tlq-16*4] + mov dyd, dxd + mova m1, [tlq-16*3] + neg dxq + mova m2, [tlq-16*2] + and dyd, ~1 + mova m3, [tlq-16*1] + and dxq, ~1 + movd m4, [tlq] + movu m5, [tlq+16*0+1] + movu m6, [tlq+16*1+1] + movzx dyd, word [base+dr_intra_derivative+dyq-90] ; angle - 90 + movzx dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle + mova [rsp+16*2], m0 + pxor m7, m7 + mova [rsp+16*3], m1 + pshufb m4, m7 + mova [rsp+16*4], m2 + lea wq, [base+ipred_z2_ssse3_table+wq] + mova [rsp+16*5], m3 + neg dxd + mova [rsp+16*6], m4 + or dyd, 4<<16 + mova [rsp+16*7], m4 + mova [rsp+16*8], m5 + mova [rsp+16*9], m6 + movq m0, [base+z_base_inc+2] + movsldup m1, [base+z2_dy_offset] + movq m2, [base+pw_256] ; 4<<6 + movq [rsp+16*14+8*0], m0 + movq [rsp+16*15+8*0], m1 + movq [rsp+16*15+8*1], m2 +%if ARCH_X86_64 + lea r10d, [dxq+(128<<6)] ; xpos +%else + mov [rsp+16*7+4*1], dyd + lea r4d, [dxq+(128<<6)] + mov r10d, r4d + movzx hd, r9b +%endif + mov r11d, (128-4)<<6 + jmp wq +.w4: + test angled, 0x400 + jnz .w4_main + movd m5, [tlq+4] + lea r3d, [hq+2] + add angled, 1022 + pshufb m5, m7 + shl r3d, 6 + movd [rsp+16*8+4], m5 + test r3d, angled + jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) + call .upsample_above + sub angled, 1075 ; angle - 53 + lea r3d, [hq+3] + xor angled, 0x7f ; 180 - angle + movd m0, r3d + movd m6, angled + shr angled, 8 ; is_sm << 1 + pshufb m0, m7 + pshufb m6, m7 + pcmpeqb m0, [base+z_filter_wh4] + pand m6, m0 + pcmpgtb m6, [base+z_filter_t_w48+angleq*8] + jmp .w8_filter_left +.upsample_above: ; w4/w8 + movq m3, [rsp+gprsize+16*8-2] + movq m1, [rsp+gprsize+16*8-1] + movq m0, [rsp+gprsize+16*8+0] + movq m4, [rsp+gprsize+16*8+1] + movddup m5, [base+pb_36_m4] + punpcklbw m1, m3 + punpcklbw m2, m0, m4 + pmaddubsw m1, m5 + pmaddubsw m2, m5 +%if ARCH_X86_64 + mova m11, [base+pb_0to15] + lea r10d, [r10+dxq+(1<<6)] + mov r11d, (128-7)<<6 +%else + mova m3, [base+pb_0to15] + mov r3d, [rsp+gprsize+16*18+4*1] + mov dword [rsp+gprsize+16*18+4*2], (128-7)<<6 + lea r3d, [r3+dxq+(1<<6)] + mov [rsp+gprsize+16*18+4*1], r3d + mova [rsp+gprsize+16*16], m3 +%endif + add dxd, dxd + paddw m1, m2 + pmulhrsw m1, m10 + movq m2, [rsp+gprsize+16*14] + paddw m2, m2 + movq [rsp+gprsize+16*14], m2 + packuswb m1, m1 + punpcklbw m1, m0 + mova [rsp+gprsize+16*8], m1 + ret +.w4_no_upsample_above: + lea r3d, [hq+3] + mov [rsp], angled + sub angled, 1112 ; angle - 90 + movd m0, r3d + mov r3d, 90 + movd m1, angled + sub r3d, angled ; 180 - angle + shr angled, 8 ; is_sm << 1 + movu m3, [base+z_filter_wh4] + mova m4, [base+z_filter_t_w48+angleq*8] + call .w8_filter_top + mov angled, [rsp] + lea r3d, [hq+2] + sub angled, 139 + shl r3d, 6 + test r3d, angled + jnz .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8) +.upsample_left: ; w4/w8 + neg hq + movd m0, [tlq+hq] + pshufb m0, m7 + movd [rsp+16*6+hq-4], m0 + 
movq m3, [rsp+16*5+7] + movq m0, [rsp+16*5+8] + movq m2, [rsp+16*5+9] + movq m4, [rsp+16*5+10] + movddup m5, [base+pb_36_m4] + punpcklbw m1, m0, m3 + punpcklbw m2, m4 + pmaddubsw m1, m5 + pmaddubsw m2, m5 + movshdup m3, [base+z2_dy_offset] +%if ARCH_X86_64 + mova m12, [base+z2_upsample] + add dyd, dyd +%else + mova m4, [base+z2_upsample] + shl dword [rsp+16*7+4*1], 1 + mova m12, m4 +%endif + paddw m1, m2 + pmulhrsw m1, m10 + movq [rsp+16*15], m3 + packuswb m1, m1 + punpcklbw m0, m1 + mova [rsp+16*5], m0 +.w4_main: + movd m6, dxd +%if ARCH_X86_64 + movd m3, dyd +%else + movd m3, [rsp+16*7+4*1] +%endif + movddup m0, [rsp+16*14+8*0] + pshufb m6, [base+pw_256] + paddw m7, m6, m6 + movq m5, [base+pw_m1to4] + pshuflw m4, m3, q0000 + punpcklqdq m6, m7 + pmullw m4, m5 + pshuflw m3, m3, q1111 + paddw m6, m0 + pshuflw m0, m4, q3333 + psubw m4, [rsp+16*15] + movq [rsp+16*6+8*1], m3 + movq [rsp+8*1], m0 ; dy*4 +%if ARCH_X86_64 + mov r8, dstq +%endif +.w4_loop0: +%if ARCH_X86_32 + mov r8, dstq +%endif + mova [rsp+16*12], m6 + mov r2d, r10d + movq [rsp+8*0], m4 + pand m0, m4, m8 + psraw m4, 6 + psubw m1, m9, m0 + psllw m0, 8 + por m0, m1 ; 64-frac_y, frac_y + movq [rsp+8*3], m0 + pabsw m4, m4 + movq [rsp+8*2], m4 + movzx hd, r9b +.w4_loop: + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x0 + movq m0, [rsp+r2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x1 + movhps m0, [rsp+r3] + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x2 + movq m1, [rsp+r2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x3 + movhps m1, [rsp+r3] + pand m2, m8, m6 + paddsw m5, m6, m7 + psubw m3, m9, m2 + psllw m2, 8 + pshufb m0, m11 + por m2, m3 + pmaddubsw m0, m2 + pand m2, m8, m5 + psubw m3, m9, m2 + psllw m2, 8 + pshufb m1, m11 + por m2, m3 + pmaddubsw m1, m2 + cmp r3d, 127 ; topleft + jge .w4_toponly + movzx r3d, byte [rsp+8*2+0] ; base_y0 + movq m3, [rsp+r3] + movzx r3d, byte [rsp+8*2+2] ; base_y1 + movhps m3, [rsp+r3] + movzx r3d, byte [rsp+8*2+4] ; base_y2 + movq m4, [rsp+r3] + movzx r3d, byte [rsp+8*2+6] ; base_y3 + movhps m4, [rsp+r3] + pshufb m3, m12 + pshufb m4, m12 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + movddup m4, [rsp+8*3] + pmaddubsw m2, m4 + pmaddubsw m3, m4 + psraw m6, 15 ; base_x < topleft + pand m2, m6 + pandn m6, m0 + por m0, m2, m6 + psraw m6, m5, 15 + pand m3, m6 + pandn m6, m1 + por m1, m3, m6 +.w4_toponly: + pmulhrsw m0, m10 + pmulhrsw m1, m10 + movifnidn strideq, stridemp + packuswb m0, m1 + movd [dstq+strideq*0], m0 + pshuflw m1, m0, q1032 + movd [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + punpckhqdq m0, m0 + movd [dstq+strideq*0], m0 + psrlq m0, 32 + movd [dstq+strideq*1], m0 + sub hd, 4 + jz .w4_end + movq m4, [rsp+8*2] + movq m3, [rsp+16*6+8*1] + paddw m6, m5, m7 ; xpos += dx + psubw m4, m3 + movq [rsp+8*2], m4 + lea dstq, [dstq+strideq*2] + cmp r2d, r11d + jge .w4_loop + movddup m5, [rsp+8*3] +.w4_leftonly_loop: + movzx r3d, byte [rsp+8*2+0] ; base_y0 + movq m1, [rsp+r3] + movzx r3d, byte [rsp+8*2+2] ; base_y1 + movhps m1, [rsp+r3] + movzx r3d, byte [rsp+8*2+4] ; base_y2 + movq m2, [rsp+r3] + movzx r3d, byte [rsp+8*2+6] ; base_y3 + movhps m2, [rsp+r3] + psubw m4, m3 + pshufb m1, m12 + pshufb m2, m12 + movq [rsp+8*2], m4 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + movifnidn strideq, stridemp + pmulhrsw m0, m10 + pmulhrsw m1, m10 + packuswb m0, m1 + movd [dstq+strideq*0], m0 + pshuflw m1, m0, q1032 + movd [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + punpckhqdq m0, m0 + movd [dstq+strideq*0], m0 + psrlq m0, 32 + movd [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + sub hd, 
4 + jg .w4_leftonly_loop +.w4_end: + sub r9d, 1<<8 + jl .w4_ret + movq m4, [rsp+8*1] +%if ARCH_X86_64 + add r8, 4 + mov dstq, r8 +%else + mov dstq, r8 + add dstq, 4 +%endif + paddw m4, [rsp+8*0] ; base_y += 4*dy + movzx r3d, word [rsp+16*15+8*1] + add r10d, r3d + movddup m6, [rsp+16*15+8*1] + paddw m6, [rsp+16*12] ; base_x += (4 << upsample_above) + jmp .w4_loop0 +.w4_ret: + RET +.w8: + test angled, 0x400 + jnz .w4_main + movd m5, [tlq+8] + lea r3d, [angleq+126] + pshufb m5, m7 +%if ARCH_X86_64 + mov r3b, hb +%else + xor r3b, r3b + or r3d, hd +%endif + movd [rsp+16*8+8], m5 + cmp r3d, 8 + ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm + call .upsample_above + sub angled, 53 + lea r3d, [hq+7] + xor angled, 0x7f ; 180 - angle + movu m1, [base+z_filter_wh8] + movd m0, r3d + movd m6, angled + shr angled, 8 ; is_sm << 1 + psrldq m2, [base+z_filter_t_w48+angleq*8], 4 + pshufb m0, m7 + pshufb m6, m7 + pcmpeqb m0, m1 + pand m6, m0 + pcmpgtb m6, m2 +%if ARCH_X86_64 + movq [rsp+16*15+8*1], m10 ; 8<<6 +%else + movq m0, m10 + movq [rsp+16*15+8*1], m0 +%endif + jmp .w8_filter_left +.w8_no_upsample_above: + lea r3d, [hq+7] + mov [rsp], angled + sub angled, 90 + movd m0, r3d + mov r3d, 90 + movd m1, angled + sub r3d, angled ; 180 - angle + shr angled, 8 ; is_sm << 1 + movu m3, [base+z_filter_wh8] + psrldq m4, [base+z_filter_t_w48+angleq*8], 4 + call .w8_filter_top + mov r3d, [rsp] + sub r3d, 141 +%if ARCH_X86_64 + mov r3b, hb +%else + xor r3b, r3b + or r3d, hd +%endif + cmp r3d, 8 + jbe .upsample_left ; angle > 140 && h <= 8 && !is_sm +.w8_filter_left: + pmovmskb r5d, m6 + test r5d, r5d + jz .w4_main + imul r5d, 0x55555555 + mov r3, tlq + shr r5d, 30 + sub r5, 3 ; filter_strength-3 + jmp .filter_left +.w8_filter_top: + movd m6, r3d + REPX {pshufb x, m7}, m0, m1, m6 + pcmpeqb m0, m3 + pand m1, m0 + pand m6, m0 + pcmpgtb m1, m4 + pcmpgtb m6, m4 + pmovmskb r5d, m1 + test r5d, r5d + jz .w8_filter_top_end ; filter_strength == 0 + imul r5d, 0x55555555 + movq m0, [rsp+gprsize+16*8-2] + shr r5d, 30 + movq m1, [rsp+gprsize+16*8-1] + sub r5, 3 ; filter_strength-3 + movddup m7, [base+z_filter_k+8*2+r5*8+24*0] + punpcklbw m0, m1 + pmaddubsw m0, m7 + movq m1, [rsp+gprsize+16*8+0] + movq m2, [rsp+gprsize+16*8+1] + movddup m7, [base+z_filter_k+8*2+r5*8+24*1] + punpcklbw m1, m2 + pmaddubsw m1, m7 + movq m2, [rsp+gprsize+16*8+2] + movddup m7, [base+z_filter_k+8*2+r5*8+24*2] + punpcklbw m2, m2 + pmaddubsw m2, m7 + paddw m0, m1 + paddw m0, m2 +%if ARCH_X86_64 + mov r3d, r7m ; maxw, offset due to call +%else + mov r3d, [rsp+gprsize+16*18+4*3] +%endif + pmulhrsw m0, m10 + pmulhrsw m1, m10 + packuswb m0, m1 + movq [rsp+gprsize+16*8], m0 + cmp r3d, 8 + jge .w8_filter_top_end + movq m0, [tlq+r3+1] + movq [rsp+gprsize+r3+16*8], m0 +.w8_filter_top_end: + ret +.w16: + test angled, 0x400 + jnz .w4_main + lea r3d, [hq+15] + sub angled, 90 + movd m0, r3d + mov r3d, 90 + movd m1, angled + sub r3d, angled ; 180 - angle + shr angled, 8 ; is_sm << 1 + movd m6, r3d + REPX {pshufb x, m7}, m0, m1, m6 + movq m3, [base+z_filter_t_w16+angleq*4] + pcmpeqb m0, [base+z_filter_wh16] + pand m1, m0 + pand m6, m0 + pcmpgtb m1, m3 + pcmpgtb m6, m3 + pmovmskb r5d, m1 + mov r3, tlq + test r5d, r5d + jz .w16_filter_left ; filter_strength == 0 + imul r5d, 0x24924924 + pshufb m5, [base+z_filter_t_w16] ; tlq[16] + shr r5d, 30 + adc r5, -4 ; filter_strength-3 + movd [rsp+16*9], m5 + movddup m7, [base+z_filter_k+8*2+r5*8+24*0] + movu m1, [rsp+16*8-2] + movu m2, [rsp+16*8-1] + punpcklbw m0, m1, m2 + pmaddubsw m0, m7 + punpckhbw m1, m2 + pmaddubsw 
m1, m7 + movddup m7, [base+z_filter_k+8*2+r5*8+24*1] + mova m3, [rsp+16*8+0] + movu m4, [rsp+16*8+1] + punpcklbw m2, m3, m4 + pmaddubsw m2, m7 + punpckhbw m3, m4 + pmaddubsw m3, m7 + paddw m0, m2 + paddw m1, m3 + test r5d, r5d + jnz .w16_filter_end ; 3-tap + movddup m7, [base+z_filter_k+8*8] + movu m3, [rsp+16*8+2] + punpcklbw m2, m3, m3 + pmaddubsw m2, m7 + punpckhbw m3, m3 + pmaddubsw m3, m7 + paddw m0, m2 + paddw m1, m3 +.w16_filter_end: + mov r2d, maxwm + pmulhrsw m0, m10 + pmulhrsw m1, m10 + packuswb m0, m1 + mova [rsp+16*8], m0 + cmp r2d, 16 + jge .w16_filter_left + movu m0, [r3+r2+1] + movu [rsp+r2+16*8], m0 +.w16_filter_left: + pmovmskb r5d, m6 + test r5d, r5d + jz .w4_main + imul r5d, 0x24924924 + shr r5d, 30 + adc r5, -4 ; filter_strength-3 + jmp .filter_left +.w32: + test angled, 0x400 + jnz .w4_main + pshufb m6, [base+z_filter_t_w16] ; tlq[32] + mov r3, tlq + lea tlq, [rsp+16*9] + movd [tlq+16*1], m6 + xor r5d, r5d ; filter_strength = 3 + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + mova m0, [tlq+16*0] + mova m1, [tlq+16*1] + mov r2d, maxwm + mova [rsp+16*8], m0 + mova [rsp+16*9], m1 + cmp r2d, 32 + jge .filter_left + movu m0, [r3+r2+16*0+1] + movu m1, [r3+r2+16*1+1] + movu [rsp+r2+16*8], m0 + movu [rsp+r2+16*9], m1 + jmp .filter_left +.w64: + movu m0, [tlq+16*2+1] + movu m1, [tlq+16*3+1] + mova [rsp+16*10], m0 + mova [rsp+16*11], m1 + test angled, 0x400 + jnz .w4_main + pshufb m1, [base+z_filter_t_w16] ; tlq[64] + mov r3, tlq + lea tlq, [rsp+16*11] + movd [tlq+16*1], m1 + xor r5d, r5d ; filter_strength = 3 + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + sub tlq, 16*2 + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + mova m0, [tlq+16*0] + mova m1, [tlq+16*1] + mova m2, [tlq+16*2] + mova m3, [tlq+16*3] + mov r2d, maxwm + mova [rsp+16* 8], m0 + mova [rsp+16* 9], m1 + mova [rsp+16*10], m2 + mova [rsp+16*11], m3 + cmp r2d, 64 + jge .filter_left + movu m0, [r3+r2+16*0+1] + movu m1, [r3+r2+16*1+1] + movu [rsp+r2+16* 8], m0 + movu [rsp+r2+16* 9], m1 + cmp r2d, 32 + jge .filter_left + movu m0, [r3+r2+16*2+1] + movu m1, [r3+r2+16*3+1] + movu [rsp+r2+16*10], m0 + movu [rsp+r2+16*11], m1 +.filter_left: + neg hq + movd m0, [r3+hq] + pxor m1, m1 + pshufb m0, m1 + movd [rsp+16*6+hq-4], m0 + lea tlq, [rsp+16*5] + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + cmp hd, -32 + jge .filter_left_end + sub tlq, 16*2 + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + mova m0, [tlq+16*0] + mova m1, [tlq+16*1] + mova [rsp+16*2], m0 + mova [rsp+16*3], m1 +.filter_left_end: + mov r2d, maxhm + mova m0, [rsp+16*5] + mova m1, [rsp+16*6] + mova m2, [rsp+16*7] + neg r2 + mova [rsp+16*4], m0 + mova [rsp+16*5], m1 + mova [rsp+16*6], m2 + cmp r2d, hd + jle .w4_main + movu m0, [r3+r2-16*2] + movu m1, [r3+r2-16*1] + movu [rsp+r2+16*4], m0 + movu [rsp+r2+16*5], m1 + cmp r2d, -32 + jle .w4_main + movu m0, [r3+r2-16*4] + movu m1, [r3+r2-16*3] + movu [rsp+r2+16*2], m0 + movu [rsp+r2+16*3], m1 + jmp .w4_main + +%if ARCH_X86_64 +cglobal ipred_z3_8bpc, 4, 9, 11, 16*10, dst, stride, tl, w, h, angle, dy, _, org_w + %define base r7-$$ + lea r7, [$$] + mova m8, [base+pw_62] + mova m9, [base+pw_64] + mova m10, [base+pw_512] + mov org_wd, wd +%else +cglobal ipred_z3_8bpc, 4, 7, 8, -16*10, dst, stride, tl, w, h, angle, dy + %define base r1-$$ + %define m8 [base+pw_62] + %define m9 [base+pw_64] + %define m10 [base+pw_512] + %define org_wd r5 + %define org_wq r5 + mov [dstq+strideq*0], strideq + mov [dstq+strideq*1], wd + LEA r1, $$ 
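+; (a note on the 32-bit path above: x86-32 builds are short on registers, so
+; stride and the original width are stashed in the first two dst rows and
+; reloaded from there by the .h*_transpose loops before any pixels are written)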
+%endif + tzcnt hd, hm + movifnidn angled, anglem + dec tlq + movsxd hq, [base+ipred_z3_ssse3_table+hq*4] + sub angled, 180 + mov dyd, angled + neg dyd + xor angled, 0x400 + or dyq, ~0x7e + lea hq, [base+ipred_z3_ssse3_table+hq] + movzx dyd, word [base+dr_intra_derivative+45*2-1+dyq] + jmp hq +.h4: + lea r4d, [angleq+88] + test r4d, 0x480 + jnz .h4_no_upsample ; !enable_intra_edge_filter || angle >= 40 + sar r4d, 9 + add r4d, wd + cmp r4d, 8 + jg .h4_no_upsample ; w > 8 || (w == 8 && is_sm) + movu m3, [tlq-7] + movu m1, [base+z_upsample1-4] + movu m4, [base+z_filter_s+2] + pshufb m0, m3, m1 + pxor m1, m1 + pshufb m2, m3, m1 + pshufb m1, m3, m4 + mova [rsp+16], m2 ; top[max_base_y] + movddup m2, [base+pb_36_m4] + add dyd, dyd + pmaddubsw m0, m2 + pmaddubsw m1, m2 + movd m5, dyd + mov r5d, dyd + pshufb m5, [base+pw_256] + paddw m0, m1 + pmulhrsw m0, m10 + shl wd, 2 + mov tlq, rsp + sub rsp, wq + packuswb m0, m0 + punpcklbw m0, m3 + paddw m6, m5, m5 + punpcklqdq m5, m6 + pshufb m0, [base+pb_15to0] + mova [tlq], m0 +.h4_upsample_loop: + lea r4d, [r5+dyq] + shr r5d, 6 + movq m0, [tlq+r5] + lea r5d, [r4+dyq] + shr r4d, 6 + movhps m0, [tlq+r4] + pand m2, m8, m5 + psubw m1, m9, m2 + psllw m2, 8 + por m1, m2 + pmaddubsw m0, m1 + paddw m5, m6 + pmulhrsw m0, m10 + packuswb m0, m0 + movq [rsp+wq-8], m0 + sub wd, 8 + jg .h4_upsample_loop + jmp .h4_transpose +.h4_no_upsample: + mov r4d, 7 + test angled, 0x400 ; !enable_intra_edge_filter + jnz .h4_main + lea r4d, [wq+3] + movd m0, r4d + movd m2, angled + shr angled, 8 ; is_sm << 1 + pxor m1, m1 + pshufb m0, m1 + pshufb m2, m1 + pcmpeqb m1, m0, [base+z_filter_wh4] + pand m1, m2 + pcmpgtb m1, [base+z_filter_t_w48+angleq*8] + pmovmskb r5d, m1 + mov r4d, 7 + test r5d, r5d + jz .h4_main ; filter_strength == 0 + movu m2, [tlq-7] + imul r5d, 0x55555555 + movu m3, [base+z_filter_s-2] + shr r5d, 30 ; filter_strength + mova m4, [base+z_upsample2] + movddup m5, [base+z_filter_k-8+r5*8+24*0] + movddup m6, [base+z_filter_k-8+r5*8+24*1] + movddup m7, [base+z_filter_k-8+r5*8+24*2] + pshufb m0, m2, m3 + shufps m3, m4, q2121 + pmaddubsw m1, m0, m5 + pmaddubsw m0, m6 + pshufb m5, m2, m3 + pmaddubsw m3, m5, m6 + pmaddubsw m5, m7 + pshufb m2, m4 + pmaddubsw m2, m7 + paddw m0, m1 + paddw m1, m3 + paddw m0, m5 + paddw m1, m2 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + lea r2d, [r4+2] + cmp wd, 4 + cmovne r4d, r2d + pshufd m0, m0, q0000 + lea tlq, [rsp+15] + packuswb m0, m1 + mova [rsp], m0 +.h4_main: + movd m5, dyd + movddup m0, [base+z_base_inc] ; base_inc << 6 + sub tlq, r4 + shl r4d, 6 + movd m7, [tlq] + movd m4, r4d + pshufb m5, [base+pw_256] + neg dyq + pshufb m7, [base+pw_m256] + mova m3, [base+z3_shuf_h4] + lea r5, [dyq+r4+63] ; ypos + pshufb m4, [base+pw_256] + psubw m4, m0 ; max_base_y + shl wd, 2 + paddw m6, m5, m5 + sub rsp, wq + punpcklqdq m5, m6 +.h4_loop: + lea r4, [r5+dyq] + sar r5, 6 + movq m0, [tlq+r5-4] + lea r5, [r4+dyq] + sar r4, 6 + movhps m0, [tlq+r4-4] + pand m2, m8, m5 + psubw m1, m9, m2 + psllw m2, 8 + pshufb m0, m3 + por m1, m2 + pmaddubsw m0, m1 + pcmpgtw m1, m4, m5 + paddw m5, m6 + pmulhrsw m0, m10 + pand m0, m1 + pandn m1, m7 + por m0, m1 + packuswb m0, m0 + movq [rsp+wq-8], m0 + sub wd, 8 + jz .h4_transpose + test r5d, r5d + jg .h4_loop + packuswb m7, m7 +.h4_end_loop: + movq [rsp+wq-8], m7 + sub wd, 8 + jg .h4_end_loop +.h4_transpose: + mova m1, [base+z_transpose4] +%if ARCH_X86_32 + mov strideq, [dstq] + mov org_wd, [dstq+strideq] +%endif + lea r2, [strideq*3] + lea dstq, [dstq+org_wq-4] +.h4_transpose_loop: + mova m0, [rsp] + add rsp, 16 + pshufb 
m0, m1 + movd [dstq+strideq*0], m0 + pshuflw m2, m0, q1032 + movd [dstq+strideq*1], m2 + punpckhqdq m0, m0 + movd [dstq+strideq*2], m0 + psrlq m0, 32 + movd [dstq+r2 ], m0 + sub dstq, 4 + sub org_wd, 4 + jg .h4_transpose_loop + RET +.h8: + lea r4d, [angleq+88] + and r4d, ~0x7f + or r4d, wd + cmp r4d, 8 + ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 + mova m4, [tlq-15] + and r4d, 4 + movu m3, [tlq- 9] + movd m1, r4d + movu m2, [base+z_filter_s+2] + pxor m0, m0 + movu m5, [base+z_filter_s+6] + movddup m7, [base+pb_36_m4] + pshufb m1, m0 ; w & 4 + movu m0, [base+z_upsample1-4] + pmaxub m1, m0 ; clip 4x8 + add dyd, dyd + pshufb m0, m4, m1 + pmaddubsw m0, m7 + pshufb m1, m4, m2 + pmaddubsw m1, m7 + pshufb m2, m3, [base+z_upsample1] + pmaddubsw m2, m7 + pshufb m3, m5 + pmaddubsw m3, m7 + movd m5, dyd + neg dyq + paddw m1, m0 + paddw m2, m3 + pmulhrsw m1, m10 + pmulhrsw m2, m10 + shl wd, 3 + lea tlq, [rsp+16] + pshufb m5, [base+pw_256] + sub rsp, wq + packuswb m1, m2 + lea r5, [dyq+63] + punpcklbw m0, m1, m4 + punpckhbw m1, m4 + mova [tlq-16*1], m0 + mova [tlq-16*0], m1 + paddw m6, m5, m5 + punpcklqdq m5, m6 +.h8_upsample_loop: + lea r4, [r5+dyq] + sar r5, 6 + movu m0, [tlq+r5] + lea r5, [r4+dyq] + sar r4, 6 + movu m1, [tlq+r4] + pand m3, m8, m5 + psubw m2, m9, m3 + psllw m2, 8 + por m3, m2 + pshufd m2, m3, q1010 + pmaddubsw m0, m2 + punpckhqdq m3, m3 + pmaddubsw m1, m3 + paddw m5, m6 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + packuswb m1, m0 + mova [rsp+wq-16], m1 + sub wd, 16 + jg .h8_upsample_loop + jmp .h8_transpose +.h8_no_upsample: + lea r4d, [wq+7] + movd m0, r4d + and r4d, 7 + or r4d, 8 ; imin(w+7, 15) + test angled, 0x400 + jnz .h8_main + movd m2, angled + shr angled, 8 ; is_sm << 1 + pxor m1, m1 + pshufb m0, m1 + pshufb m2, m1 + movu m1, [base+z_filter_wh8] + psrldq m3, [base+z_filter_t_w48+angleq*8], 4 + pcmpeqb m1, m0 + pand m1, m2 + pcmpgtb m1, m3 + pmovmskb r5d, m1 + test r5d, r5d + jz .h8_main ; filter_strength == 0 + mova m0, [tlq-15] + imul r5d, 0x55555555 + movd m1, [tlq+1] + neg r4 + movd m2, [tlq+r4] + shr r5d, 30 + pxor m7, m7 + lea tlq, [rsp+16*2] + sub r5, 3 ; filter_strength-3 + mova [tlq+16*0], m0 + pshufb m1, m7 + mova [tlq+16*1], m1 + pshufb m2, m7 + movq [tlq+r4+8], m2 + neg r4d + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + sar r5d, 1 + add tlq, 31 + add r5d, 17 + cmp wd, 8 + cmova r4d, r5d +.h8_main: + movd m5, dyd + sub tlq, r4 + shl r4d, 6 + movd m7, [tlq] + movd m4, r4d + pshufb m5, [base+pw_256] + neg dyq + pshufb m7, [base+pw_m256] + mova m3, [base+z3_shuf] + lea r5, [dyq+r4+63] + pshufb m4, [base+pw_256] + psubw m4, [base+z3_base_inc] + shl wd, 3 + mova m6, m5 + sub rsp, wq +.h8_loop: + mov r4, r5 + sar r4, 6 + movu m0, [tlq+r4-8] + pand m2, m8, m5 + psubw m1, m9, m2 + psllw m2, 8 + pshufb m0, m3 + por m1, m2 + pmaddubsw m0, m1 + pcmpgtw m1, m4, m5 + paddw m5, m6 + pmulhrsw m0, m10 + pand m0, m1 + pandn m1, m7 + por m0, m1 + packuswb m0, m0 + movq [rsp+wq-8], m0 + sub wd, 8 + jz .h8_transpose + add r5, dyq + jg .h8_loop + packuswb m7, m7 +.h8_end_loop: + movq [rsp+wq-8], m7 + sub wd, 8 + jg .h8_end_loop +.h8_transpose: +%if ARCH_X86_32 + mov strideq, [dstq] + mov org_wd, [dstq+strideq] +%endif + or r3d, 8 + cmp org_wd, 4 +%if ARCH_X86_64 + jne .end_transpose_main +%else + jne .end_transpose_loop +%endif + mova m1, [rsp+16*1] + mova m0, [rsp+16*0] + lea r2, [strideq*3] + add rsp, 16*2 + punpcklbw m2, m1, m0 + punpckhbw m1, m0 + punpckhbw m0, m1, m2 + punpcklbw m1, m2 +.write_4x8_end: + call .write_4x8 + RET 
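+; .write_4x8 stores the two transposed 4x4 halves held in m0/m1 as a 4x8
+; column; within each half the four rows are written bottom-up, and the
+; caller is expected to have set r2 = stride*3. In C terms (an illustrative
+; sketch, where reg[i][] stands for the byte contents of m0/m1):
+;   for (int i = 0; i < 2; i++, dst += 4 * stride)
+;       for (int y = 0; y < 4; y++)  // dword y of the register -> row 3-y
+;           memcpy(&dst[(3 - y) * stride], &reg[i][4 * y], 4);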
+.write_4x8: + movd [dstq+r2 ], m0 + pshuflw m4, m0, q1032 + movd [dstq+strideq*2], m4 + punpckhqdq m0, m0 + movd [dstq+strideq*1], m0 + psrlq m0, 32 + movd [dstq+strideq*0], m0 + lea dstq, [dstq+strideq*4] + movd [dstq+r2 ], m1 + pshuflw m4, m1, q1032 + movd [dstq+strideq*2], m4 + punpckhqdq m1, m1 + movd [dstq+strideq*1], m1 + psrlq m1, 32 + movd [dstq+strideq*0], m1 + ret +.h16: + lea r4d, [wq+15] + movd m0, r4d + and r4d, 15 + or r4d, 16 ; imin(w+15, 31) + test angled, 0x400 + jnz .h16_main + movd m2, angled + shr angled, 8 ; is_sm << 1 + pxor m1, m1 + pshufb m0, m1 + pshufb m2, m1 + movq m3, [base+z_filter_t_w16+angleq*4] + pcmpeqb m1, m0, [base+z_filter_wh16] + pand m1, m2 + pcmpgtb m1, m3 + pmovmskb r5d, m1 + test r5d, r5d + jz .h16_main ; filter_strength == 0 + mova m0, [tlq-16*2+1] + imul r5d, 0x24924924 + mova m1, [tlq-16*1+1] + neg r4 + movd m2, [tlq-16*0+1] + shr r5d, 30 + movd m3, [tlq+r4] + adc r5, -4 ; filter_strength-3 + pxor m7, m7 + lea tlq, [rsp+16*2] + mova [tlq-16*1], m0 + pshufb m2, m7 + mova [tlq+16*0], m1 + pshufb m3, m7 + mova [tlq+16*1], m2 + movq [tlq+r4+8], m3 + neg r4d + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + add tlq, 31 + cmp wd, 16 + jle .h16_main + pshuflw m0, [tlq-47], q0000 + sar r5, 1 + movq m1, [base+z3_filter_k_tail+r5*4] + lea r4d, [r5+33] + pmaddubsw m0, m1 +%if ARCH_X86_64 + pmulhrsw m0, m10 +%else + pmulhrsw m0, m4 +%endif + packuswb m0, m0 + movd [tlq-35], m0 +.h16_main: + movd m5, dyd + sub tlq, r4 + movd m4, r4d + shl r4d, 6 + movd m7, [tlq] + pxor m6, m6 + pshufb m5, [base+pw_256] + neg dyq + pshufb m7, m6 + mova m3, [base+z3_shuf] + lea r5, [dyq+r4+63] + pshufb m4, m6 + psubb m4, [base+pb_15to0] + shl wd, 4 + mova m6, m5 + sub rsp, wq +.h16_loop: + mov r4, r5 + pand m2, m8, m5 + sar r4, 6 + psubw m1, m9, m2 + psllw m2, 8 + movu m0, [tlq+r4-8*2] + por m2, m1 + movu m1, [tlq+r4-8*1] + pshufb m0, m3 + pmaddubsw m0, m2 + pshufb m1, m3 + pmaddubsw m1, m2 + psrlw m2, m5, 6 + paddw m5, m6 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + packsswb m2, m2 + packuswb m0, m1 + pcmpgtb m1, m4, m2 + pand m0, m1 + pandn m1, m7 + por m0, m1 + mova [rsp+wq-16], m0 + sub wd, 16 + jz .h16_transpose + add r5, dyq + jg .h16_loop +.h16_end_loop: + mova [rsp+wq-16], m7 + sub wd, 16 + jg .h16_end_loop +.h16_transpose: +%if ARCH_X86_32 + mov strideq, [dstq] + mov org_wd, [dstq+strideq] +%endif + or r3d, 16 + cmp org_wd, 4 +%if ARCH_X86_64 + jne .end_transpose_main +%else + jne .end_transpose_loop +%endif +.h16_transpose_w4: + mova m2, [rsp+16*3] + mova m4, [rsp+16*2] + mova m3, [rsp+16*1] + mova m0, [rsp+16*0] + lea r2, [strideq*3] + add rsp, 16*4 + punpckhbw m1, m2, m4 + punpcklbw m2, m4 + punpckhbw m4, m3, m0 + punpcklbw m3, m0 + punpckhwd m0, m1, m4 + punpcklwd m1, m4 + call .write_4x8 + lea dstq, [dstq+strideq*4] + punpckhwd m0, m2, m3 + punpcklwd m1, m2, m3 + jmp .write_4x8_end +.h32: + lea r4d, [wq+31] + and r4d, 31 + or r4d, 32 ; imin(w+31, 63) + test angled, 0x400 ; !enable_intra_edge_filter + jnz .h32_main + mova m0, [tlq-16*4+1] + mova m1, [tlq-16*3+1] + mova m2, [tlq-16*2+1] + mova m3, [tlq-16*1+1] + movd m4, [tlq-16*0+1] + neg r4 + movd m5, [tlq+r4] + pxor m7, m7 + lea tlq, [rsp+16*4] + mova [tlq-16*3], m0 + mova [tlq-16*2], m1 + xor r5d, r5d ; filter_strength = 3 + mova [tlq-16*1], m2 + pshufb m4, m7 + mova [tlq+16*0], m3 + pshufb m5, m7 + mova [tlq+16*1], m4 + movq [tlq+r4+8], m5 + neg r4d + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + sub tlq, 16*2 + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + 
add tlq, 63 + cmp wd, 32 + jle .h32_main + pshuflw m0, [tlq-79], q0000 + movq m1, [base+z3_filter_k_tail] + add r4d, 2 + pmaddubsw m0, m1 +%if ARCH_X86_64 + pmulhrsw m0, m10 +%else + pmulhrsw m0, m4 +%endif + packuswb m0, m0 + movd [tlq-67], m0 +.h32_main: + movd m5, dyd + sub tlq, r4 + movd m4, r4d + shl r4d, 6 + movd m7, [tlq] + pxor m6, m6 + pshufb m5, [base+pw_256] + neg dyq + pshufb m7, m6 + mova m3, [base+z3_shuf] + lea r5, [dyq+r4+63] + pshufb m4, m6 + psubb m4, [base+pb_15to0] + mova m6, m5 +.h32_loop: + mov r4, r5 + pand m2, m8, m5 + sar r4, 6 + psubw m1, m9, m2 + psllw m2, 8 + movu m0, [tlq+r4-8*4] + por m2, m1 + movu m1, [tlq+r4-8*3] + pshufb m0, m3 + pmaddubsw m0, m2 + pshufb m1, m3 + pmaddubsw m1, m2 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + sub rsp, 32 + packuswb m0, m1 + mova [rsp+16*0], m0 + movu m0, [tlq+r4-8*2] + movu m1, [tlq+r4-8*1] + pshufb m0, m3 + pshufb m1, m3 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + psrlw m2, m5, 6 + paddw m5, m6 + packsswb m2, m2 + packuswb m0, m1 + pcmpgtb m1, m4, m2 + paddsb m2, [base+pb_16] + pand m0, m1 + pandn m1, m7 + por m0, m1 + pcmpgtb m1, m4, m2 + mova [rsp+16*1], m0 + pand m0, m1, [rsp+16*0] + pandn m1, m7 + por m0, m1 + mova [rsp+16*0], m0 + dec wd + jz .h32_transpose + add r5, dyq + jg .h32_loop +.h32_end_loop: + sub rsp, 32 + mova [rsp+16*1], m7 + mova [rsp+16*0], m7 + dec wd + jg .h32_end_loop +.h32_transpose: + or r3d, 32 + jmp .end_transpose_main +.h64: + lea r4d, [wq+63] + test angled, 0x400 ; !enable_intra_edge_filter + jnz .h64_main + mova m0, [tlq-16*8+1] + mova m1, [tlq-16*7+1] + mova m2, [tlq-16*6+1] + mova m3, [tlq-16*5+1] + mova [rsp+16*1], m0 + mova [rsp+16*2], m1 + mova [rsp+16*3], m2 + mova [rsp+16*4], m3 + mova m0, [tlq-16*4+1] + mova m1, [tlq-16*3+1] + mova m2, [tlq-16*2+1] + mova m3, [tlq-16*1+1] + movd m4, [tlq-16*0+1] + neg r4 + movd m5, [tlq+r4] + pxor m7, m7 + lea tlq, [rsp+16*8] + mova [tlq-16*3], m0 + mova [tlq-16*2], m1 + xor r5d, r5d ; filter_strength = 3 + mova [tlq-16*1], m2 + pshufb m4, m7 + mova [tlq+16*0], m3 + pshufb m5, m7 + mova [tlq+16*1], m4 + movq [tlq+r4+8], m5 + neg r4d + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + sub tlq, 16*2 + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + sub tlq, 16*2 + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + sub tlq, 16*2 + cmp wd, 64 + jl .h64_filter96 ; skip one call if the last 32 bytes aren't used + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge +.h64_filter96: + add tlq, 127 +.h64_main: + movd m5, dyd + sub tlq, r4 + movd m4, r4d + shl r4d, 6 + movd m7, [tlq] + pxor m6, m6 + pshufb m5, [base+pw_256] + neg dyq + pshufb m7, m6 + mova m3, [base+z3_shuf] + lea r5, [dyq+r4+63] + pshufb m4, m6 + psubb m4, [base+pb_15to0] + mova m6, m5 +.h64_loop: + mov r4, r5 + pand m2, m8, m5 + sar r4, 6 + psubw m1, m9, m2 + psllw m2, 8 + movu m0, [tlq+r4-8*8] + por m2, m1 + movu m1, [tlq+r4-8*7] + pshufb m0, m3 + pmaddubsw m0, m2 + pshufb m1, m3 + pmaddubsw m1, m2 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + sub rsp, 64 + packuswb m0, m1 + mova [rsp+16*0], m0 + movu m0, [tlq+r4-8*6] + movu m1, [tlq+r4-8*5] + pshufb m0, m3 + pshufb m1, m3 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + packuswb m0, m1 + mova [rsp+16*1], m0 + movu m0, [tlq+r4-8*4] + movu m1, [tlq+r4-8*3] + pshufb m0, m3 + pshufb m1, m3 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + packuswb m0, m1 + mova [rsp+16*2], m0 + movu m0, [tlq+r4-8*2] + movu 
m1, [tlq+r4-8*1] + pshufb m0, m3 + pshufb m1, m3 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + psrlw m2, m5, 6 + paddw m5, m6 + packsswb m2, m2 + packuswb m0, m1 + pcmpgtb m1, m4, m2 + paddsb m2, [base+pb_16] + pand m0, m1 + pandn m1, m7 + por m0, m1 + pcmpgtb m1, m4, m2 + paddsb m2, [base+pb_16] + mova [rsp+16*3], m0 + pand m0, m1, [rsp+16*2] + pandn m1, m7 + por m0, m1 + pcmpgtb m1, m4, m2 + paddsb m2, [base+pb_16] + mova [rsp+16*2], m0 + pand m0, m1, [rsp+16*1] + pandn m1, m7 + por m0, m1 + pcmpgtb m1, m4, m2 + mova [rsp+16*1], m0 + pand m0, m1, [rsp+16*0] + pandn m1, m7 + por m0, m1 + mova [rsp+16*0], m0 + dec wd + jz .h64_transpose + add r5, dyq + jg .h64_loop +.h64_end_loop: + sub rsp, 64 + mova [rsp+16*3], m7 + mova [rsp+16*2], m7 + mova [rsp+16*1], m7 + mova [rsp+16*0], m7 + dec wd + jg .h64_end_loop +.h64_transpose: + or r3d, 64 +.end_transpose_main: +%if ARCH_X86_64 + lea r5, [r3*3] + lea r7, [strideq*3] +%else + mov strideq, [dstq] + mov org_wd, [dstq+strideq] +%endif +.end_transpose_loop: + lea r4, [rsp+r3-8] + lea r6, [dstq+org_wq-8] +.end_transpose_loop_y: + movq m0, [r4+r3*1] + movq m4, [r4+r3*0] +%if ARCH_X86_64 + movq m1, [r4+r5 ] + movq m5, [r4+r3*2] + lea r2, [r4+r3*4] +%else + lea r2, [r4+r3*2] + movq m1, [r2+r3*1] + movq m5, [r2+r3*0] + lea r2, [r2+r3*2] +%endif + movq m2, [r2+r3*1] + movq m6, [r2+r3*0] +%if ARCH_X86_64 + movq m3, [r2+r5 ] + movq m7, [r2+r3*2] +%else + lea r2, [r2+r3*2] + movq m3, [r2+r3*1] + movq m7, [r2+r3*0] +%endif + sub r4, 8 + punpcklbw m0, m4 + punpcklbw m1, m5 + punpcklbw m2, m6 + punpcklbw m3, m7 + punpckhwd m4, m1, m0 + punpcklwd m1, m0 + punpckhwd m0, m3, m2 + punpcklwd m3, m2 + punpckhdq m2, m3, m1 + punpckldq m3, m1 + punpckldq m1, m0, m4 + punpckhdq m0, m4 + movhps [r6+strideq*0], m0 + movq [r6+strideq*1], m0 +%if ARCH_X86_64 + movhps [r6+strideq*2], m1 + movq [r6+r7 ], m1 + lea r6, [r6+strideq*4] +%else + lea r6, [r6+strideq*2] + movhps [r6+strideq*0], m1 + movq [r6+strideq*1], m1 + lea r6, [r6+strideq*2] +%endif + movhps [r6+strideq*0], m2 + movq [r6+strideq*1], m2 +%if ARCH_X86_64 + movhps [r6+strideq*2], m3 + movq [r6+r7 ], m3 + lea r6, [r6+strideq*4] +%else + lea r6, [r6+strideq*2] + movhps [r6+strideq*0], m3 + movq [r6+strideq*1], m3 + lea r6, [r6+strideq*2] +%endif + cmp r4, rsp + jae .end_transpose_loop_y + lea rsp, [rsp+r3*8] + sub org_wd, 8 + jg .end_transpose_loop + RET + +;--------------------------------------------------------------------------------------- +;int dav1d_pal_pred_ssse3(pixel *dst, const ptrdiff_t stride, const uint16_t *const pal, +; const uint8_t *idx, const int w, const int h); +;--------------------------------------------------------------------------------------- +cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h + mova m4, [palq] + LEA r2, pal_pred_ssse3_table + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r2+wq*4] + packuswb m4, m4 + add wq, r2 + lea r2, [strideq*3] + jmp wq +.w4: + pshufb m0, m4, [idxq] + add idxq, 16 + movd [dstq ], m0 + pshuflw m1, m0, q1032 + movd [dstq+strideq ], m1 + punpckhqdq m0, m0 + movd [dstq+strideq*2], m0 + psrlq m0, 32 + movd [dstq+r2 ], m0 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4 + RET +ALIGN function_align +.w8: + pshufb m0, m4, [idxq] + pshufb m1, m4, [idxq+16] + add idxq, 32 + movq [dstq ], m0 + movhps [dstq+strideq ], m0 + movq [dstq+strideq*2], m1 + movhps [dstq+r2 ], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w8 + RET +ALIGN function_align +.w16: + pshufb m0, m4, [idxq] + pshufb m1, m4, [idxq+16] + pshufb 
m2, m4, [idxq+32] + pshufb m3, m4, [idxq+48] + add idxq, 64 + mova [dstq ], m0 + mova [dstq+strideq ], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r2 ], m3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16 + RET +ALIGN function_align +.w32: + pshufb m0, m4, [idxq] + pshufb m1, m4, [idxq+16] + pshufb m2, m4, [idxq+32] + pshufb m3, m4, [idxq+48] + add idxq, 64 + mova [dstq ], m0 + mova [dstq+16 ], m1 + mova [dstq+strideq ], m2 + mova [dstq+strideq+16], m3 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32 + RET +ALIGN function_align +.w64: + pshufb m0, m4, [idxq] + pshufb m1, m4, [idxq+16] + pshufb m2, m4, [idxq+32] + pshufb m3, m4, [idxq+48] + add idxq, 64 + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + add dstq, strideq + sub hd, 1 + jg .w64 + RET + +;--------------------------------------------------------------------------------------- +;void dav1d_ipred_cfl_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int16_t *ac, const int alpha); +;--------------------------------------------------------------------------------------- +%macro IPRED_CFL 1 ; ac in, unpacked pixels out + psignw m3, m%1, m1 + pabsw m%1, m%1 + pmulhrsw m%1, m2 + psignw m%1, m3 + paddw m%1, m0 +%endmacro + +%if UNIX64 +DECLARE_REG_TMP 7 +%else +DECLARE_REG_TMP 5 +%endif + +cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha + movifnidn wd, wm + movifnidn hd, hm + tzcnt r6d, hd + lea t0d, [wq+hq] + movd m4, t0d + tzcnt t0d, t0d + movd m5, t0d + LEA t0, ipred_cfl_ssse3_table + tzcnt wd, wd + movsxd r6, [t0+r6*4] + movsxd wq, [t0+wq*4+16] + pcmpeqd m3, m3 + psrlw m4, 1 + add r6, t0 + add wq, t0 + movifnidn acq, acmp + jmp r6 +.h4: + movd m0, [tlq-4] + pmaddubsw m0, m3 + jmp wq +.w4: + movd m1, [tlq+1] + pmaddubsw m1, m3 + psubw m0, m4 + paddw m0, m1 + pmaddwd m0, m3 + cmp hd, 4 + jg .w4_mul + psrlw m0, 3 ; dc >>= ctz(width + height); + jmp .w4_end +.w4_mul: + punpckhqdq m1, m0, m0 + paddw m0, m1 + pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 + paddw m0, m1 + psrlw m0, 2 + mov r6d, 0x5556 + mov r2d, 0x3334 + test hd, 8 + cmovz r6d, r2d + movd m5, r6d + pmulhuw m0, m5 +.w4_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s4: + movd m1, alpham + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + lea r6, [strideq*3] + pabsw m2, m1 + psllw m2, 9 +.s4_loop: + mova m4, [acq] + mova m5, [acq+16] + IPRED_CFL 4 + IPRED_CFL 5 + packuswb m4, m5 + movd [dstq+strideq*0], m4 + pshuflw m4, m4, q1032 + movd [dstq+strideq*1], m4 + punpckhqdq m4, m4 + movd [dstq+strideq*2], m4 + psrlq m4, 32 + movd [dstq+r6 ], m4 + lea dstq, [dstq+strideq*4] + add acq, 32 + sub hd, 4 + jg .s4_loop + RET +ALIGN function_align +.h8: + movq m0, [tlq-8] + pmaddubsw m0, m3 + jmp wq +.w8: + movq m1, [tlq+1] + pmaddubsw m1, m3 + psubw m4, m0 + punpckhqdq m0, m0 + psubw m0, m4 + paddw m0, m1 + pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 + paddw m0, m1 + pmaddwd m0, m3 + psrlw m0, m5 + cmp hd, 8 + je .w8_end + mov r6d, 0x5556 + mov r2d, 0x3334 + cmp hd, 32 + cmovz r6d, r2d + movd m1, r6d + pmulhuw m0, m1 +.w8_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s8: + movd m1, alpham + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + lea r6, [strideq*3] + pabsw m2, m1 + psllw m2, 9 +.s8_loop: + mova m4, [acq] + mova m5, [acq+16] + IPRED_CFL 4 + IPRED_CFL 5 + packuswb m4, m5 + movq [dstq ], m4 + movhps [dstq+strideq ], m4 + mova m4, [acq+32] + mova m5, [acq+48] + IPRED_CFL 4 + IPRED_CFL 5 + packuswb m4, m5 + movq [dstq+strideq*2], m4 + movhps [dstq+r6 ], m4 + lea dstq, 
[dstq+strideq*4] + add acq, 64 + sub hd, 4 + jg .s8_loop + RET +ALIGN function_align +.h16: + mova m0, [tlq-16] + pmaddubsw m0, m3 + jmp wq +.w16: + movu m1, [tlq+1] + pmaddubsw m1, m3 + paddw m0, m1 + psubw m4, m0 + punpckhqdq m0, m0 + psubw m0, m4 + pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 + paddw m0, m1 + pmaddwd m0, m3 + psrlw m0, m5 + cmp hd, 16 + je .w16_end + mov r6d, 0x5556 + mov r2d, 0x3334 + test hd, 8|32 + cmovz r6d, r2d + movd m1, r6d + pmulhuw m0, m1 +.w16_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s16: + movd m1, alpham + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + pabsw m2, m1 + psllw m2, 9 +.s16_loop: + mova m4, [acq] + mova m5, [acq+16] + IPRED_CFL 4 + IPRED_CFL 5 + packuswb m4, m5 + mova [dstq], m4 + mova m4, [acq+32] + mova m5, [acq+48] + IPRED_CFL 4 + IPRED_CFL 5 + packuswb m4, m5 + mova [dstq+strideq], m4 + lea dstq, [dstq+strideq*2] + add acq, 64 + sub hd, 2 + jg .s16_loop + RET +ALIGN function_align +.h32: + mova m0, [tlq-32] + pmaddubsw m0, m3 + mova m2, [tlq-16] + pmaddubsw m2, m3 + paddw m0, m2 + jmp wq +.w32: + movu m1, [tlq+1] + pmaddubsw m1, m3 + movu m2, [tlq+17] + pmaddubsw m2, m3 + paddw m1, m2 + paddw m0, m1 + psubw m4, m0 + punpckhqdq m0, m0 + psubw m0, m4 + pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 + paddw m0, m1 + pmaddwd m0, m3 + psrlw m0, m5 + cmp hd, 32 + je .w32_end + lea r2d, [hq*2] + mov r6d, 0x5556 + mov r2d, 0x3334 + test hd, 64|16 + cmovz r6d, r2d + movd m1, r6d + pmulhuw m0, m1 +.w32_end: + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 +.s32: + movd m1, alpham + pshuflw m1, m1, q0000 + punpcklqdq m1, m1 + pabsw m2, m1 + psllw m2, 9 +.s32_loop: + mova m4, [acq] + mova m5, [acq+16] + IPRED_CFL 4 + IPRED_CFL 5 + packuswb m4, m5 + mova [dstq], m4 + mova m4, [acq+32] + mova m5, [acq+48] + IPRED_CFL 4 + IPRED_CFL 5 + packuswb m4, m5 + mova [dstq+16], m4 + add dstq, strideq + add acq, 64 + dec hd + jg .s32_loop + RET + +;--------------------------------------------------------------------------------------- +;void dav1d_ipred_cfl_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int16_t *ac, const int alpha); +;--------------------------------------------------------------------------------------- +cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha + mov hd, hm ; zero upper half + tzcnt r6d, hd + sub tlq, hq + tzcnt wd, wm + movu m0, [tlq] + mov t0d, 0x8000 + movd m3, t0d + movd m2, r6d + psrld m3, m2 + LEA t0, ipred_cfl_left_ssse3_table + movsxd r6, [t0+r6*4] + pcmpeqd m2, m2 + pmaddubsw m0, m2 + add r6, t0 + add t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table + movsxd wq, [t0+wq*4] + add wq, t0 + movifnidn acq, acmp + jmp r6 +.h32: + movu m1, [tlq+16] ; unaligned when jumping here from dc_top + pmaddubsw m1, m2 + paddw m0, m1 +.h16: + pshufd m1, m0, q3232 ; psrlq m1, m0, 16 + paddw m0, m1 +.h8: + pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 + paddw m0, m1 +.h4: + pmaddwd m0, m2 + pmulhrsw m0, m3 + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + jmp wq + +;--------------------------------------------------------------------------------------- +;void dav1d_ipred_cfl_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int16_t *ac, const int alpha); +;--------------------------------------------------------------------------------------- +cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha + LEA t0, ipred_cfl_left_ssse3_table + tzcnt wd, wm + inc tlq + movu m0, [tlq] + 
movifnidn hd, hm + mov r6d, 0x8000 + movd m3, r6d + movd m2, wd + psrld m3, m2 + movsxd r6, [t0+wq*4] + pcmpeqd m2, m2 + pmaddubsw m0, m2 + add r6, t0 + add t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table + movsxd wq, [t0+wq*4] + add wq, t0 + movifnidn acq, acmp + jmp r6 + +;--------------------------------------------------------------------------------------- +;void dav1d_ipred_cfl_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, +; const int width, const int height, const int16_t *ac, const int alpha); +;--------------------------------------------------------------------------------------- +cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha + tzcnt wd, wm + movifnidn hd, hm + LEA r6, ipred_cfl_splat_ssse3_table + movsxd wq, [r6+wq*4] + movddup m0, [r6-ipred_cfl_splat_ssse3_table+pw_128] + add wq, r6 + movifnidn acq, acmp + jmp wq + +%macro RELOAD_ACQ_32 1 + mov acq, ac_bakq ; restore acq +%endmacro + +%if ARCH_X86_64 +cglobal ipred_cfl_ac_420_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak +DECLARE_REG_TMP 7 + movddup m2, [pb_2] +%else +cglobal ipred_cfl_ac_420_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h +DECLARE_REG_TMP 4 +%define ac_bakq acmp + mov t0d, 0x02020202 + movd m2, t0d + pshufd m2, m2, q0000 +%endif + movifnidn wd, wm + mov t0d, hm + mov hd, t0d + imul t0d, wd + movd m5, t0d + movifnidn hpadd, hpadm +%if ARCH_X86_64 + mov ac_bakq, acq +%endif + shl hpadd, 2 + sub hd, hpadd + pxor m4, m4 + cmp wd, 8 + jg .w16 + je .w8 + ; fall-through +%if ARCH_X86_64 + DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak +%else + DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h +%endif +.w4: + lea stride3q, [strideq*3] +.w4_loop: + movq m0, [yq] + movq m1, [yq+strideq] + movhps m0, [yq+strideq*2] + movhps m1, [yq+stride3q] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + mova [acq], m0 + paddw m4, m0 + lea yq, [yq+strideq*4] + add acq, 16 + sub hd, 2 + jg .w4_loop + test hpadd, hpadd + jz .calc_avg_4_8 + punpckhqdq m0, m0 +.w4_hpad_loop: + mova [acq], m0 + paddw m4, m0 + add acq, 16 + sub hpadd, 2 + jg .w4_hpad_loop + jmp .calc_avg_4_8 +.w8: + lea stride3q, [strideq*3] + test wpadd, wpadd + jnz .w8_wpad +.w8_loop: + mova m0, [yq] + mova m1, [yq+strideq] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + mova [acq], m0 + paddw m4, m0 + mova m0, [yq+strideq*2] + mova m1, [yq+stride3q] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + mova [acq+16], m0 + paddw m4, m0 + lea yq, [yq+strideq*4] + add acq, 32 + sub hd, 2 + jg .w8_loop + test hpadd, hpadd + jz .calc_avg_4_8 + jmp .w8_hpad +.w8_wpad: ; wpadd=1 + movddup m0, [yq] + movddup m1, [yq+strideq] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + pshufhw m0, m0, q3333 + mova [acq], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 16 + sub hd, 1 + jg .w8_wpad + test hpadd, hpadd + jz .calc_avg_4_8 +.w8_hpad: + mova [acq], m0 + paddw m4, m0 + add acq, 16 + sub hpadd, 1 + jg .w8_hpad + jmp .calc_avg_4_8 +.w16: + test wpadd, wpadd + jnz .w16_wpad +.w16_loop: + mova m0, [yq] + mova m1, [yq+strideq] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + mova [acq], m0 + paddw m4, m0 + mova m6, [yq+16] + mova m1, [yq+strideq+16] + pmaddubsw m6, m2 + pmaddubsw m1, m2 + paddw m6, m1 + mova [acq+16], m6 + paddw m4, m6 + lea yq, [yq+strideq*2] + add acq, 32 + dec hd + jg .w16_loop + test hpadd, hpadd + jz .calc_avg16 + jmp .w16_hpad_loop +.w16_wpad: + cmp wpadd, 2 + jl .w16_pad1 + je .w16_pad2 +.w16_pad3: + movddup m0, [yq] + movddup m1, [yq+strideq] + 
pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + pshufhw m0, m0, q3333 + mova [acq], m0 + paddw m4, m0 + mova m6, m0 + punpckhqdq m6, m0, m0 + mova [acq+16], m6 + paddw m4, m6 + lea yq, [yq+strideq*2] + add acq, 32 + dec hd + jg .w16_pad3 + jmp .w16_wpad_done +.w16_pad2: + mova m0, [yq] + mova m1, [yq+strideq] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + mova [acq], m0 + paddw m4, m0 + pshufhw m6, m0, q3333 + punpckhqdq m6, m6 + mova [acq+16], m6 + paddw m4, m6 + lea yq, [yq+strideq*2] + add acq, 32 + dec hd + jg .w16_pad2 + jmp .w16_wpad_done +.w16_pad1: + mova m0, [yq] + mova m1, [yq+strideq] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + paddw m0, m1 + mova [acq], m0 + paddw m4, m0 + movddup m6, [yq+16] + movddup m1, [yq+strideq+16] + pmaddubsw m6, m2 + pmaddubsw m1, m2 + paddw m6, m1 + pshufhw m6, m6, q3333 + mova [acq+16], m6 + paddw m4, m6 + lea yq, [yq+strideq*2] + add acq, 32 + dec hd + jg .w16_pad1 +.w16_wpad_done: + test hpadd, hpadd + jz .calc_avg16 +.w16_hpad_loop: + mova [acq], m0 + paddw m4, m0 + mova [acq+16], m6 + paddw m4, m6 + add acq, 32 + dec hpadd + jg .w16_hpad_loop + jmp .calc_avg16 + +%if ARCH_X86_64 + DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak +%else + DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h +%endif +.calc_avg_4_8: + psrlw m2, 9 + pmaddwd m4, m2 + jmp .calc_avg +.calc_avg16: + psrld m0, m4, 16 + pslld m4, 16 + psrld m4, 16 + paddd m4, m0 +.calc_avg: + movd szd, m5 + psrad m5, 1 + tzcnt r1d, szd + paddd m4, m5 + movd m1, r1d + pshufd m0, m4, q2301 + paddd m0, m4 + pshufd m4, m0, q1032 + paddd m0, m4 + psrad m0, m1 ; sum >>= log2sz; + packssdw m0, m0 + RELOAD_ACQ_32 acq +.sub_loop: + mova m1, [acq] + psubw m1, m0 ; ac[x] -= sum; + mova [acq], m1 + add acq, 16 + sub szd, 8 + jg .sub_loop + RET + +%if ARCH_X86_64 +cglobal ipred_cfl_ac_422_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak + movddup m2, [pb_4] +%else +cglobal ipred_cfl_ac_422_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h + mov t0d, 0x04040404 + movd m2, t0d + pshufd m2, m2, q0000 +%endif + movifnidn wd, wm + mov t0d, hm + mov hd, t0d + imul t0d, wd + movd m6, t0d + movifnidn hpadd, hpadm +%if ARCH_X86_64 + mov ac_bakq, acq +%endif + shl hpadd, 2 + sub hd, hpadd + pxor m4, m4 + pxor m5, m5 + cmp wd, 8 + jg .w16 + je .w8 + ; fall-through + +%if ARCH_X86_64 + DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak +%else + DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h +%endif +.w4: + lea stride3q, [strideq*3] +.w4_loop: + movq m1, [yq] + movhps m1, [yq+strideq] + movq m0, [yq+strideq*2] + movhps m0, [yq+stride3q] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + mova [acq], m1 + mova [acq+16], m0 + paddw m4, m0 + paddw m5, m1 + lea yq, [yq+strideq*4] + add acq, 32 + sub hd, 4 + jg .w4_loop + test hpadd, hpadd + jz .calc_avg_4 + punpckhqdq m0, m0 +.w4_hpad_loop: + mova [acq], m0 + paddw m4, m0 + add acq, 16 + sub hpadd, 2 + jg .w4_hpad_loop + jmp .calc_avg_4 +.w8: + lea stride3q, [strideq*3] + test wpadd, wpadd + jnz .w8_wpad +.w8_loop: + mova m1, [yq] + mova m0, [yq+strideq] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + mova [acq], m1 + mova [acq+16], m0 + paddw m4, m0 + paddw m5, m1 + mova m1, [yq+strideq*2] + mova m0, [yq+stride3q] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + mova [acq+32], m1 + mova [acq+48], m0 + paddw m4, m0 + paddw m5, m1 + lea yq, [yq+strideq*4] + add acq, 64 + sub hd, 4 + jg .w8_loop + test hpadd, hpadd + jz .calc_avg_8_16 + jmp .w8_hpad +.w8_wpad: + movddup m1, [yq] + pmaddubsw m1, m2 + pshufhw m1, m1, q3333 + mova [acq], m1 + paddw m5, m1 + movddup m0, [yq+strideq] + 
pmaddubsw m0, m2 + pshufhw m0, m0, q3333 + mova [acq+16], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 32 + sub hd, 2 + jg .w8_wpad + test hpadd, hpadd + jz .calc_avg_8_16 +.w8_hpad: + mova [acq], m0 + paddw m4, m0 + mova [acq+16], m0 + paddw m4, m0 + add acq, 32 + sub hpadd, 2 + jg .w8_hpad + jmp .calc_avg_8_16 +.w16: + test wpadd, wpadd + jnz .w16_wpad +.w16_loop: + mova m1, [yq] + mova m0, [yq+16] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + mova [acq], m1 + mova [acq+16], m0 + paddw m5, m0 + paddw m5, m1 + mova m1, [yq+strideq] + mova m0, [yq+strideq+16] + pmaddubsw m0, m2 + pmaddubsw m1, m2 + mova [acq+32], m1 + mova [acq+48], m0 + paddw m4, m0 + paddw m4, m1 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_loop + test hpadd, hpadd + jz .calc_avg_8_16 + jmp .w16_hpad_loop +.w16_wpad: + cmp wpadd, 2 + jl .w16_pad1 + je .w16_pad2 +.w16_pad3: + movddup m1, [yq] + pmaddubsw m1, m2 + pshufhw m1, m1, q3333 + mova [acq], m1 + paddw m5, m1 + punpckhqdq m1, m1 + mova [acq+16], m1 + paddw m5, m1 + movddup m1, [yq+strideq] + pmaddubsw m1, m2 + pshufhw m1, m1, q3333 + mova [acq+32], m1 + paddw m4, m1 + punpckhqdq m0, m1, m1 + mova [acq+48], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_pad3 + jmp .w16_wpad_done +.w16_pad2: + mova m1, [yq] + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1 + pshufhw m1, m1, q3333 + punpckhqdq m1, m1 + mova [acq+16], m1 + paddw m5, m1 + mova m1, [yq+strideq] + pmaddubsw m1, m2 + mova [acq+32], m1 + paddw m4, m1 + mova m0, m1 + pshufhw m0, m0, q3333 + punpckhqdq m0, m0 + mova [acq+48], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_pad2 + jmp .w16_wpad_done +.w16_pad1: + mova m1, [yq] + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1 + movddup m0, [yq+16] + pmaddubsw m0, m2 + pshufhw m0, m0, q3333 + mova [acq+16], m0 + paddw m5, m0 + mova m1, [yq+strideq] + pmaddubsw m1, m2 + mova [acq+32], m1 + paddw m4, m1 + movddup m0, [yq+strideq+16] + pmaddubsw m0, m2 + pshufhw m0, m0, q3333 + mova [acq+48], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_pad1 +.w16_wpad_done: + test hpadd, hpadd + jz .calc_avg_8_16 +.w16_hpad_loop: + mova [acq], m1 + mova [acq+16], m0 + paddw m4, m1 + paddw m5, m0 + mova [acq+32], m1 + mova [acq+48], m0 + paddw m4, m1 + paddw m5, m0 + add acq, 64 + sub hpadd, 2 + jg .w16_hpad_loop + jmp .calc_avg_8_16 + +%if ARCH_X86_64 + DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak +%else + DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h +%endif +.calc_avg_4: + psrlw m2, 10 + pmaddwd m5, m2 + pmaddwd m0, m4, m2 + jmp .calc_avg +.calc_avg_8_16: + mova m0, m5 + psrld m5, 16 + pslld m0, 16 + psrld m0, 16 + paddd m5, m0 + mova m0, m4 + psrld m0, 16 + pslld m4, 16 + psrld m4, 16 + paddd m0, m4 +.calc_avg: + paddd m5, m0 + movd szd, m6 + psrad m6, 1 + tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height); + paddd m5, m6 + movd m1, r1d + pshufd m0, m5, q2301 + paddd m0, m5 + pshufd m5, m0, q1032 + paddd m0, m5 + psrad m0, m1 ; sum >>= log2sz; + packssdw m0, m0 + RELOAD_ACQ_32 acq ; ac = ac_orig +.sub_loop: + mova m1, [acq] + psubw m1, m0 + mova [acq], m1 + add acq, 16 + sub szd, 8 + jg .sub_loop + RET + +%if ARCH_X86_64 +cglobal ipred_cfl_ac_444_8bpc, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak + movddup m2, [pb_4] +%else +cglobal ipred_cfl_ac_444_8bpc, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h +%define ac_bakq [rsp+16*4] + mov t0d, 0x04040404 + movd m2, t0d + pshufd m2, m2, q0000 +%endif + movifnidn wd, wm + movifnidn hpadd, 
hpadm + movd m0, hpadd + mov t0d, hm + mov hd, t0d + imul t0d, wd + movd m6, t0d + movd hpadd, m0 + mov ac_bakq, acq + shl hpadd, 2 + sub hd, hpadd + pxor m5, m5 + pxor m4, m4 + cmp wd, 16 + jg .w32 + cmp wd, 8 + jg .w16 + je .w8 + ; fall-through + +%if ARCH_X86_64 + DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak +%else + DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h +%endif +.w4: + lea stride3q, [strideq*3] +.w4_loop: + movd m1, [yq] + movd m3, [yq+strideq] + punpckldq m1, m3 + punpcklbw m1, m1 + movd m0, [yq+strideq*2] + movd m3, [yq+stride3q] + punpckldq m0, m3 + punpcklbw m0, m0 + pmaddubsw m1, m2 + pmaddubsw m0, m2 + mova [acq], m1 + mova [acq+16], m0 + paddw m5, m0 + paddw m5, m1 + lea yq, [yq+strideq*4] + add acq, 32 + sub hd, 4 + jg .w4_loop + test hpadd, hpadd + jz .calc_avg_4 + punpckhqdq m0, m0 +.w4_hpad_loop: + mova [acq], m0 + paddw m5, m0 + add acq, 16 + sub hpadd, 2 + jg .w4_hpad_loop +.calc_avg_4: + psrlw m2, 10 + pmaddwd m5, m2 + jmp .calc_avg + +.w8: + lea stride3q, [strideq*3] + test wpadd, wpadd + jnz .w8_wpad +.w8_loop: + movq m1, [yq] + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1 + movq m0, [yq+strideq] + punpcklbw m0, m0 + pmaddubsw m0, m2 + mova [acq+16], m0 + paddw m5, m0 + movq m1, [yq+strideq*2] + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq+32], m1 + paddw m4, m1 + movq m0, [yq+stride3q] + punpcklbw m0, m0 + pmaddubsw m0, m2 + mova [acq+48], m0 + paddw m4, m0 + lea yq, [yq+strideq*4] + add acq, 64 + sub hd, 4 + jg .w8_loop + test hpadd, hpadd + jz .calc_avg_8_16 + jmp .w8_hpad +.w8_wpad: + movd m1, [yq] + punpcklbw m1, m1 + punpcklqdq m1, m1 + pmaddubsw m1, m2 + pshufhw m1, m1, q3333 + mova [acq], m1 + paddw m5, m1 + movd m0, [yq+strideq] + punpcklbw m0, m0 + punpcklqdq m0, m0 + pmaddubsw m0, m2 + pshufhw m0, m0, q3333 + mova [acq+16], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 32 + sub hd, 2 + jg .w8_wpad + test hpadd, hpadd + jz .calc_avg_8_16 +.w8_hpad: + mova [acq], m0 + paddw m5, m0 + mova [acq+16], m0 + paddw m4, m0 + add acq, 32 + sub hpadd, 2 + jg .w8_hpad + jmp .calc_avg_8_16 + +.w16: + test wpadd, wpadd + jnz .w16_wpad +.w16_loop: + mova m0, [yq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1 + punpckhbw m0, m0 + pmaddubsw m0, m2 + mova [acq+16], m0 + paddw m5, m0 + mova m0, [yq+strideq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq+32], m1 + paddw m4, m1 + punpckhbw m0, m0 + pmaddubsw m0, m2 + mova [acq+48], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_loop + test hpadd, hpadd + jz .calc_avg_8_16 + jmp .w16_hpad_loop +.w16_wpad: + cmp wpadd, 2 + jl .w16_pad1 + je .w16_pad2 +.w16_pad3: + movd m1, [yq] + punpcklbw m1, m1 + punpcklqdq m1, m1 + pshufhw m1, m1, q3333 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1 + punpckhqdq m1, m1 + mova [acq+16], m1 + paddw m5, m1 + movd m1, [yq+strideq] + punpcklbw m1, m1 + punpcklqdq m1, m1 + pshufhw m1, m1, q3333 + pmaddubsw m1, m2 + mova [acq+32], m1 + paddw m4, m1 + punpckhqdq m0, m1, m1 + mova [acq+48], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_pad3 + jmp .w16_wpad_done +.w16_pad2: + movq m1, [yq] + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1 + pshufhw m1, m1, q3333 + punpckhqdq m1, m1 + mova [acq+16], m1 + paddw m5, m1 + movq m1, [yq+strideq] + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq+32], m1 + paddw m4, m1 + mova m0, m1 + pshufhw m0, m0, q3333 + punpckhqdq m0, m0 + mova [acq+48], m0 + paddw m4, m0 + 
lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_pad2 + jmp .w16_wpad_done +.w16_pad1: + mova m0, [yq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1 + punpckhbw m0, m0 + punpcklqdq m0, m0 + pshufhw m0, m0, q3333 + pmaddubsw m0, m2 + mova [acq+16], m0 + paddw m5, m0 + mova m0, [yq+strideq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq+32], m1 + paddw m4, m1 + punpckhbw m0, m0 + punpcklqdq m0, m0 + pshufhw m0, m0, q3333 + pmaddubsw m0, m2 + mova [acq+48], m0 + paddw m4, m0 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_pad1 +.w16_wpad_done: + test hpadd, hpadd + jz .calc_avg_8_16 +.w16_hpad_loop: + mova [acq], m1 + mova [acq+16], m0 + paddw m4, m1 + paddw m5, m0 + mova [acq+32], m1 + mova [acq+48], m0 + paddw m4, m1 + paddw m5, m0 + add acq, 64 + sub hpadd, 2 + jg .w16_hpad_loop +.calc_avg_8_16: + mova m0, m5 + psrld m5, 16 + pslld m0, 16 + psrld m0, 16 + paddd m5, m0 + mova m0, m4 + psrld m0, 16 + pslld m4, 16 + psrld m4, 16 + paddd m0, m4 + paddd m5, m0 + jmp .calc_avg + +.w32: + pxor m0, m0 + mova [rsp ], m0 + mova [rsp+16], m0 + mova [rsp+32], m0 + mova [rsp+48], m0 + test wpadd, wpadd + jnz .w32_wpad +.w32_loop: + mova m0, [yq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1, [rsp] + mova [rsp ], m5 + punpckhbw m0, m0 + pmaddubsw m0, m2 + mova [acq+16], m0 + paddw m5, m0, [rsp+16] + mova [rsp+16], m5 + mova m4, [yq+16] + mova m3, m4 + punpcklbw m3, m3 + pmaddubsw m3, m2 + mova [acq+32], m3 + paddw m5, m3, [rsp+32] + mova [rsp+32], m5 + punpckhbw m4, m4 + pmaddubsw m4, m2 + mova [acq+48], m4 + paddw m5, m4, [rsp+48] + mova [rsp+48], m5 + lea yq, [yq+strideq] + add acq, 64 + sub hd, 1 + jg .w32_loop + test hpadd, hpadd + jz .calc_avg_32 + jmp .w32_hpad_loop +.w32_wpad: + cmp wpadd, 2 + jl .w32_pad1 + je .w32_pad2 + cmp wpadd, 4 + jl .w32_pad3 + je .w32_pad4 + cmp wpadd, 6 + jl .w32_pad5 + je .w32_pad6 +.w32_pad7: + movd m1, [yq] + punpcklbw m1, m1 + punpcklqdq m1, m1 + pshufhw m1, m1, q3333 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1, [rsp] + mova [rsp ], m5 + mova m0, m1 + punpckhqdq m0, m0 + mova [acq+16], m0 + paddw m5, m0, [rsp+16] + mova [rsp+16], m5 + mova m3, m0 + mova [acq+32], m3 + paddw m5, m3, [rsp+32] + mova [rsp+32], m5 + mova m4, m3 + mova [acq+48], m4 + paddw m5, m4, [rsp+48] + mova [rsp+48], m5 + lea yq, [yq+strideq] + add acq, 64 + sub hd, 1 + jg .w32_pad7 + jmp .w32_wpad_done +.w32_pad6: + mova m0, [yq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1, [rsp] + mova [rsp ], m5 + pshufhw m0, m1, q3333 + punpckhqdq m0, m0 + mova [acq+16], m0 + paddw m5, m0, [rsp+16] + mova [rsp+16], m5 + mova m3, m0 + mova [acq+32], m3 + paddw m5, m3, [rsp+32] + mova [rsp+32], m5 + mova m4, m3 + mova [acq+48], m4 + paddw m5, m4, [rsp+48] + mova [rsp+48], m5 + lea yq, [yq+strideq] + add acq, 64 + sub hd, 1 + jg .w32_pad6 + jmp .w32_wpad_done +.w32_pad5: + mova m0, [yq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + mova m5, [rsp] + paddw m5, m1 + mova [rsp ], m5 + punpckhbw m0, m0 + punpcklqdq m0, m0 + pshufhw m0, m0, q3333 + pmaddubsw m0, m2 + mova [acq+16], m0 + paddw m5, m0, [rsp+16] + mova [rsp+16], m5 + mova m3, m0 + punpckhqdq m3, m3 + mova [acq+32], m3 + paddw m5, m3, [rsp+32] + mova [rsp+32], m5 + mova m4, m3 + mova [acq+48], m4 + paddw m5, m4, [rsp+48] + mova [rsp+48], m5 + lea yq, [yq+strideq] + add acq, 64 + sub hd, 1 + jg .w32_pad5 + jmp .w32_wpad_done +.w32_pad4: + mova m0, [yq] + mova m1, m0 + 
punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1, [rsp] + mova [rsp ], m5 + punpckhbw m0, m0 + pmaddubsw m0, m2 + mova [acq+16], m0 + paddw m5, m0, [rsp+16] + mova [rsp+16], m5 + mova m3, m0 + pshufhw m3, m3, q3333 + punpckhqdq m3, m3 + mova [acq+32], m3 + paddw m5, m3, [rsp+32] + mova [rsp+32], m5 + mova m4, m3 + mova [acq+48], m4 + paddw m5, m4, [rsp+48] + mova [rsp+48], m5 + lea yq, [yq+strideq] + add acq, 64 + sub hd, 1 + jg .w32_pad4 + jmp .w32_wpad_done +.w32_pad3: + mova m0, [yq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1, [rsp] + mova [rsp ], m5 + punpckhbw m0, m0 + pmaddubsw m0, m2 + mova [acq+16], m0 + paddw m5, m0, [rsp+16] + mova [rsp+16], m5 + movd m3, [yq+16] + punpcklbw m3, m3 + punpcklqdq m3, m3 + pshufhw m3, m3, q3333 + pmaddubsw m3, m2 + mova [acq+32], m3 + paddw m5, m3, [rsp+32] + mova [rsp+32], m5 + mova m4, m3 + punpckhqdq m4, m4 + mova [acq+48], m4 + paddw m5, m4, [rsp+48] + mova [rsp+48], m5 + lea yq, [yq+strideq] + add acq, 64 + sub hd, 1 + jg .w32_pad3 + jmp .w32_wpad_done +.w32_pad2: + mova m0, [yq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1, [rsp] + mova [rsp ], m5 + punpckhbw m0, m0 + pmaddubsw m0, m2 + mova [acq+16], m0 + paddw m5, m0, [rsp+16] + mova [rsp+16], m5 + mova m3, [yq+16] + punpcklbw m3, m3 + pmaddubsw m3, m2 + mova [acq+32], m3 + paddw m5, m3, [rsp+32] + mova [rsp+32], m5 + pshufhw m4, m3, q3333 + punpckhqdq m4, m4 + mova [acq+48], m4 + paddw m5, m4, [rsp+48] + mova [rsp+48], m5 + lea yq, [yq+strideq] + add acq, 64 + sub hd, 1 + jg .w32_pad2 + jmp .w32_wpad_done +.w32_pad1: + mova m0, [yq] + mova m1, m0 + punpcklbw m1, m1 + pmaddubsw m1, m2 + mova [acq], m1 + paddw m5, m1, [rsp] + mova [rsp ], m5 + punpckhbw m0, m0 + pmaddubsw m0, m2 + mova [acq+16], m0 + paddw m5, m0, [rsp+16] + mova [rsp+16], m5 + mova m4, [yq+16] + mova m3, m4 + punpcklbw m3, m3 + pmaddubsw m3, m2 + mova [acq+32], m3 + paddw m5, m3, [rsp+32] + mova [rsp+32], m5 + punpckhbw m4, m4 + punpcklqdq m4, m4 + pshufhw m4, m4, q3333 + pmaddubsw m4, m2 + mova [acq+48], m4 + paddw m5, m4, [rsp+48] + mova [rsp+48], m5 + lea yq, [yq+strideq] + add acq, 64 + sub hd, 1 + jg .w32_pad1 +.w32_wpad_done: + test hpadd, hpadd + jz .calc_avg_32 +.w32_hpad_loop: + mova [acq], m1 + mova [acq+16], m0 + paddw m5, m1, [rsp] + mova [rsp ], m5 + paddw m5, m0, [rsp+16] + mova [rsp+16], m5 + mova [acq+32], m3 + mova [acq+48], m4 + paddw m5, m3, [rsp+32] + mova [rsp+32], m5 + paddw m5, m4, [rsp+48] + mova [rsp+48], m5 + add acq, 64 + sub hpadd, 1 + jg .w32_hpad_loop + +%if ARCH_X86_64 + DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak +%else + DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h +%endif + +.calc_avg_32: + mova m5, [rsp] + mova m0, m5 + psrld m5, 16 + pslld m0, 16 + psrld m0, 16 + paddd m5, m0 + mova m0, [rsp+16] + mova m3, m0 + psrld m0, 16 + pslld m3, 16 + psrld m3, 16 + paddd m0, m3 + paddd m5, m0 + mova m0, [rsp+32] + mova m3, m0 + psrld m0, 16 + pslld m3, 16 + psrld m3, 16 + paddd m0, m3 + mova m1, [rsp+48] + mova m3, m1 + psrld m1, 16 + pslld m3, 16 + psrld m3, 16 + paddd m1, m3 + paddd m1, m0 + paddd m5, m1 +.calc_avg: + movd szd, m6 + psrad m6, 1 + tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height); + paddd m5, m6 + movd m1, r1d + pshufd m0, m5, q2301 + paddd m0, m5 + pshufd m5, m0, q1032 + paddd m0, m5 + psrad m0, m1 ; sum >>= log2sz; + packssdw m0, m0 + RELOAD_ACQ_32 acq ; ac = ac_orig +.sub_loop: + mova m1, [acq] + psubw m1, m0 + mova [acq], m1 + add acq, 16 + sub szd, 8 + jg 
.sub_loop + RET + + ; %1 simd register that holds the mask and will hold the result + ; %2 simd register that holds the "true" values + ; %3 location of the "false" values (simd register/memory) +%macro BLEND 3 ; mask, true, false + pand %2, %1 + pandn %1, %3 + por %1, %2 +%endmacro + +%macro PAETH 2 ; top, ldiff + pavgb m1, m%1, m3 + pxor m0, m%1, m3 + pand m0, m4 + psubusb m2, m5, m1 + psubb m1, m0 + psubusb m1, m5 + por m1, m2 + paddusb m1, m1 + por m1, m0 ; min(tldiff, 255) + psubusb m2, m5, m3 + psubusb m0, m3, m5 + por m2, m0 ; tdiff +%ifnum %2 + pminub m2, m%2 + pcmpeqb m0, m%2, m2 ; ldiff <= tdiff +%else + mova m0, %2 + pminub m2, m0 + pcmpeqb m0, m2 +%endif + pminub m1, m2 + pcmpeqb m1, m2 ; ldiff <= tldiff && tdiff <= tldiff + mova m2, m3 + BLEND m0, m2, m%1 + BLEND m1, m0, m5 +%endmacro + +cglobal ipred_paeth_8bpc, 3, 6, 8, -7*16, dst, stride, tl, w, h +%define base r5-ipred_paeth_ssse3_table + tzcnt wd, wm + movifnidn hd, hm + pxor m0, m0 + movd m5, [tlq] + pshufb m5, m0 + LEA r5, ipred_paeth_ssse3_table + movsxd wq, [r5+wq*4] + movddup m4, [base+ipred_paeth_shuf] + add wq, r5 + jmp wq +.w4: + movd m6, [tlq+1] ; top + pshufd m6, m6, q0000 + lea r3, [strideq*3] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 ; ldiff +.w4_loop: + sub tlq, 4 + movd m3, [tlq] + mova m1, [base+ipred_h_shuf] + pshufb m3, m1 ; left + PAETH 6, 7 + movd [dstq ], m1 + pshuflw m0, m1, q1032 + movd [dstq+strideq ], m0 + punpckhqdq m1, m1 + movd [dstq+strideq*2], m1 + psrlq m1, 32 + movd [dstq+r3 ], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4_loop + RET +ALIGN function_align +.w8: + movddup m6, [tlq+1] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 +.w8_loop: + sub tlq, 2 + movd m3, [tlq] + pshufb m3, [base+ipred_paeth_shuf] + PAETH 6, 7 + movq [dstq ], m1 + movhps [dstq+strideq], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET +ALIGN function_align +.w16: + movu m6, [tlq+1] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 +.w16_loop: + sub tlq, 1 + movd m3, [tlq] + pxor m1, m1 + pshufb m3, m1 + PAETH 6, 7 + mova [dstq], m1 + add dstq, strideq + sub hd, 1 + jg .w16_loop + RET +ALIGN function_align +.w32: + movu m6, [tlq+1] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 + mova [rsp ], m6 + mova [rsp+16], m7 + movu m6, [tlq+17] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 + mova [rsp+32], m6 +.w32_loop: + dec tlq + movd m3, [tlq] + pxor m1, m1 + pshufb m3, m1 + mova m6, [rsp] + PAETH 6, [rsp+16] + mova [dstq ], m1 + mova m6, [rsp+32] + PAETH 6, 7 + mova [dstq+16], m1 + add dstq, strideq + dec hd + jg .w32_loop + RET +ALIGN function_align +.w64: + movu m6, [tlq+1] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 + mova [rsp ], m6 + mova [rsp+16], m7 + movu m6, [tlq+17] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 + mova [rsp+32], m6 + mova [rsp+48], m7 + movu m6, [tlq+33] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 + mova [rsp+64], m6 + mova [rsp+80], m7 + movu m6, [tlq+49] + psubusb m7, m5, m6 + psubusb m0, m6, m5 + por m7, m0 + mova [rsp+96], m6 +.w64_loop: + dec tlq + movd m3, [tlq] + pxor m1, m1 + pshufb m3, m1 + mova m6, [rsp] + PAETH 6, [rsp+16] + mova [dstq ], m1 + mova m6, [rsp+32] + PAETH 6, [rsp+48] + mova [dstq+16], m1 + mova m6, [rsp+64] + PAETH 6, [rsp+80] + mova [dstq+32], m1 + mova m6, [rsp+96] + PAETH 6, 7 + mova [dstq+48], m1 + add dstq, strideq + dec hd + jg .w64_loop + RET + + +%macro FILTER 4 ;dst, src, tmp, shuf +%ifnum %4 + pshufb m%2, m%4 +%else + pshufb m%2, %4 +%endif + pshufd m%1, m%2, q0000 ;p0 p1
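+ ; each pshufd/pmaddubsw pair (the q0000 step above, q1111/q2222/q3333
+ ; below) applies the filter-intra taps for one pair of reference pixels:
+ ; p0-p6 are gathered by the pshufb above, and m2-m5 hold the tap tables
+ ; for the selected filter mode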
+ pmaddubsw m%1, m2 + pshufd m%3, m%2, q1111 ;p2 p3 + pmaddubsw m%3, m3 + paddw m%1, [base+pw_8] + paddw m%1, m%3 + pshufd m%3, m%2, q2222 ;p4 p5 + pmaddubsw m%3, m4 + paddw m%1, m%3 + pshufd m%3, m%2, q3333 ;p6 __ + pmaddubsw m%3, m5 + paddw m%1, m%3 + psraw m%1, 4 + packuswb m%1, m%1 +%endmacro + +cglobal ipred_filter_8bpc, 3, 7, 8, dst, stride, tl, w, h, filter +%define base r6-$$ + LEA r6, $$ + tzcnt wd, wm +%ifidn filterd, filterm + movzx filterd, filterb +%else + movzx filterd, byte filterm +%endif + shl filterd, 6 + lea filterq, [base+filter_intra_taps+filterq] + movq m0, [tlq-3] ;_ 6 5 0 1 2 3 4 + movsxd wq, [base+ipred_filter_ssse3_table+wq*4] + mova m2, [filterq+16*0] + mova m3, [filterq+16*1] + mova m4, [filterq+16*2] + mova m5, [filterq+16*3] + lea wq, [base+ipred_filter_ssse3_table+wq] + mov hd, hm + jmp wq +.w4: + mova m1, [base+filter_shuf1] + sub tlq, 3 + sub tlq, hq + jmp .w4_loop_start +.w4_loop: + movd m0, [tlq+hq] + punpckldq m0, m6 + lea dstq, [dstq+strideq*2] +.w4_loop_start: + FILTER 6, 0, 7, 1 + movd [dstq+strideq*0], m6 + pshuflw m6, m6, q1032 + movd [dstq+strideq*1], m6 + sub hd, 2 + jg .w4_loop + RET + +ALIGN function_align +.w8: + movq m6, [tlq+1] ;_ _ _ 0 1 2 3 4 + sub tlq, 5 + sub tlq, hq + +.w8_loop: + FILTER 7, 0, 1, [base+filter_shuf1] + punpcklqdq m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + FILTER 0, 6, 1, [base+filter_shuf2] + + punpckldq m6, m7, m0 + movq [dstq+strideq*0], m6 + punpckhqdq m6, m6 + movq [dstq+strideq*1], m6 + + movd m0, [tlq+hq] ;_ 6 5 0 + punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 + + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET + +ALIGN function_align +.w16: + movu m6, [tlq+1] ;top row + sub tlq, 5 + sub tlq, hq + +.w16_loop: + FILTER 7, 0, 1, [base+filter_shuf1] + punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+strideq*0], m7 + psrlq m7, 32 + palignr m7, m6, 4 + + FILTER 6, 0, 1, [base+filter_shuf2] + punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+4+strideq*0], m6 + psrlq m6, 32 + palignr m6, m7, 4 + + FILTER 7, 0, 1, [base+filter_shuf2] + punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+8+strideq*0], m7 + psrlq m7, 32 + palignr m7, m6, 4 + + FILTER 6, 0, 1, [base+filter_shuf2] + movd [dstq+12+strideq*0], m6 + psrlq m6, 32 + palignr m6, m7, 4 + mova [dstq+strideq*1], m6 + + movd m0, [tlq+hq] ;_ 6 5 0 + punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 + + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w16_loop + RET + +ALIGN function_align +.w32: + movu m6, [tlq+1] ;top row + lea filterq, [tlq+17] + sub tlq, 5 + sub tlq, hq + +.w32_loop: + FILTER 7, 0, 1, [base+filter_shuf1] + punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+strideq*0], m7 + psrlq m7, 32 + palignr m7, m6, 4 + + FILTER 6, 0, 1, [base+filter_shuf2] + punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+4+strideq*0], m6 + psrlq m6, 32 + palignr m6, m7, 4 + + FILTER 7, 0, 1, [base+filter_shuf2] + punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+8+strideq*0], m7 + psrlq m7, 32 + palignr m7, m6, 4 + + FILTER 6, 0, 1, [base+filter_shuf2] + movu m1, [filterq] + punpckldq m0, m7, m1 ;_ _ _ 0 1 2 3 4 _ _ _ _ _ _ _ _ + punpcklqdq m0, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+12+strideq*0], m6 + psrlq m6, 32 + palignr m6, m7, 4 + mova [dstq+strideq*1], m6 + + mova m6, m1 + + FILTER 7, 0, 6, [base+filter_shuf2] + punpcklqdq m0, m1, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+16+strideq*0], m7 + psrlq m7, 32 + palignr m7, m1, 4 + + FILTER 6, 0, 1, [base+filter_shuf2] + 
punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+20+strideq*0], m6 + psrlq m6, 32 + palignr m6, m7, 4 + + FILTER 7, 0, 1, [base+filter_shuf2] + punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 + movd [dstq+24+strideq*0], m7 + psrlq m7, 32 + palignr m7, m6, 4 + + FILTER 6, 0, 1, [base+filter_shuf2] + movd [dstq+28+strideq*0], m6 + psrlq m6, 32 + palignr m6, m7, 4 + mova [dstq+16+strideq*1], m6 + + mova m6, [dstq+strideq*1] + movd m0, [tlq+hq] ;_ 6 5 0 + punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 + lea filterq, [dstq+16+strideq*1] + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop + RET diff --git a/third_party/dav1d/src/x86/itx.h b/third_party/dav1d/src/x86/itx.h new file mode 100644 index 0000000000..478eb6c6b6 --- /dev/null +++ b/third_party/dav1d/src/x86/itx.h @@ -0,0 +1,363 @@ +/* + * Copyright © 2018-2023, VideoLAN and dav1d authors + * Copyright © 2018-2023, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "src/cpu.h"
+#include "src/itx.h"
+
+#define BF_BPC(x, bits, suffix) x##_##bits##bpc_##suffix
+
+#define decl_itx2_fns(w, h, opt) \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))
+
+#define decl_itx12_fns(w, h, opt) \
+decl_itx2_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))
+
+#define decl_itx16_fns(w, h, opt) \
+decl_itx12_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))
+
+#define decl_itx17_fns(w, h, opt) \
+decl_itx16_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
+
+#define decl_itx_fns(ext) \
+decl_itx17_fns( 4,  4, ext); \
+decl_itx16_fns( 4,  8, ext); \
+decl_itx16_fns( 4, 16, ext); \
+decl_itx16_fns( 8,  4, ext); \
+decl_itx16_fns( 8,  8, ext); \
+decl_itx16_fns( 8, 16, ext); \
+decl_itx2_fns ( 8, 32, ext); \
+decl_itx16_fns(16,  4, ext); \
+decl_itx16_fns(16,  8, ext); \
+decl_itx12_fns(16, 16, ext); \
+decl_itx2_fns (16, 32, ext); \
+decl_itx2_fns (32,  8, ext); \
+decl_itx2_fns (32, 16, ext); \
+decl_itx2_fns (32, 32, ext); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, ext)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, ext))
+
+
+#define decl_itx2_bpc_fns(w, h, bpc, opt) \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_identity_##w##x##h, bpc, opt))
+
+#define decl_itx12_bpc_fns(w, h, bpc, opt) \
+decl_itx2_bpc_fns(w, h, bpc, opt); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_adst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_identity_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_dct_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_adst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_dct_##w##x##h, bpc, opt))
+
+#define decl_itx16_bpc_fns(w, h, bpc, opt) \
+decl_itx12_bpc_fns(w, h, bpc, opt); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_adst_identity_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_adst_##w##x##h, bpc, opt)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, bpc, opt))
+
+#define decl_itx_bpc_fns(bpc, ext) \
+decl_itx16_bpc_fns( 4,  4, bpc, ext); \
+decl_itx16_bpc_fns( 4,  8, bpc, ext); \
+decl_itx16_bpc_fns( 4, 16, bpc, ext); \
+decl_itx16_bpc_fns( 8,  4, bpc, ext); \
+decl_itx16_bpc_fns( 8,  8, bpc, ext); \
+decl_itx16_bpc_fns( 8, 16, bpc, ext); \
+decl_itx2_bpc_fns ( 8, 32, bpc, ext); \
+decl_itx16_bpc_fns(16,  4, bpc, ext); \
+decl_itx16_bpc_fns(16,  8, bpc, ext); \
+decl_itx12_bpc_fns(16, 16, bpc, ext); \
+decl_itx2_bpc_fns (16, 32, bpc, ext); \
+decl_itx2_bpc_fns (32,  8, bpc, ext); \
+decl_itx2_bpc_fns (32, 16, bpc, ext); \
+decl_itx2_bpc_fns (32, 32, bpc, ext); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_16x64, bpc, ext)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_32x64, bpc, ext)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x16, bpc, ext)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x32, bpc, ext)); \
+decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x64, bpc, ext))
+
+decl_itx_fns(avx512icl);
+decl_itx_bpc_fns(10, avx512icl);
+decl_itx_fns(avx2);
+decl_itx_bpc_fns(10, avx2);
+decl_itx_bpc_fns(12, avx2);
+decl_itx_fns(sse4);
+decl_itx_fns(ssse3);
+decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_avx2);
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, sse2));
+
+static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, const int bpc) {
+#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
+    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
+        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
+
+#define assign_itx1_fn(pfx, w, h, ext) \
+    assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext)
+
+#define assign_itx2_fn(pfx, w, h, ext) \
+    assign_itx1_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext)
+
+#define assign_itx12_fn(pfx, w, h, ext) \
+    assign_itx2_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \
+    assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \
+    assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \
+    assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \
+    assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \
+    assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \
+    assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \
+    assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \
+    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
+    assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext)
+
+#define assign_itx16_fn(pfx, w, h, ext) \
+    assign_itx12_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \
+    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \
+    assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \
+    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext)
+
+#define assign_itx17_fn(pfx, w, h, ext) \
+    assign_itx16_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext)
+
+
+#define assign_itx_bpc_fn(pfx, w, h, type, type_enum, bpc, ext) \
+    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
+        BF_BPC(dav1d_inv_txfm_add_##type##_##w##x##h, bpc, ext)
+
+#define assign_itx1_bpc_fn(pfx, w, h, bpc, ext) \
+    assign_itx_bpc_fn(pfx, w, h, dct_dct, DCT_DCT, bpc, ext)
+
+#define assign_itx2_bpc_fn(pfx, w, h, bpc, ext) \
+    assign_itx1_bpc_fn(pfx, w, h, bpc, ext); \
+    assign_itx_bpc_fn(pfx, w, h, identity_identity, IDTX, bpc, ext)
+
+#define assign_itx12_bpc_fn(pfx, w, h, bpc, ext) \
+
assign_itx2_bpc_fn(pfx, w, h, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, dct_adst, ADST_DCT, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, dct_identity, H_DCT, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, adst_dct, DCT_ADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, adst_adst, ADST_ADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, identity_dct, V_DCT, bpc, ext) + +#define assign_itx16_bpc_fn(pfx, w, h, bpc, ext) \ + assign_itx12_bpc_fn(pfx, w, h, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, adst_identity, H_ADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, flipadst_identity, H_FLIPADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, identity_adst, V_ADST, bpc, ext); \ + assign_itx_bpc_fn(pfx, w, h, identity_flipadst, V_FLIPADST, bpc, ext) + + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return; + + assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, sse2); + + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + +#if BITDEPTH == 8 + assign_itx16_fn(, 4, 4, ssse3); + assign_itx16_fn(R, 4, 8, ssse3); + assign_itx16_fn(R, 8, 4, ssse3); + assign_itx16_fn(, 8, 8, ssse3); + assign_itx16_fn(R, 4, 16, ssse3); + assign_itx16_fn(R, 16, 4, ssse3); + assign_itx16_fn(R, 8, 16, ssse3); + assign_itx16_fn(R, 16, 8, ssse3); + assign_itx12_fn(, 16, 16, ssse3); + assign_itx2_fn (R, 8, 32, ssse3); + assign_itx2_fn (R, 32, 8, ssse3); + assign_itx2_fn (R, 16, 32, ssse3); + assign_itx2_fn (R, 32, 16, ssse3); + assign_itx2_fn (, 32, 32, ssse3); + assign_itx1_fn (R, 16, 64, ssse3); + assign_itx1_fn (R, 32, 64, ssse3); + assign_itx1_fn (R, 64, 16, ssse3); + assign_itx1_fn (R, 64, 32, ssse3); + assign_itx1_fn ( , 64, 64, ssse3); +#endif + + if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return; + +#if BITDEPTH == 16 + if (bpc == 10) { + assign_itx16_fn(, 4, 4, sse4); + assign_itx16_fn(R, 4, 8, sse4); + assign_itx16_fn(R, 4, 16, sse4); + assign_itx16_fn(R, 8, 4, sse4); + assign_itx16_fn(, 8, 8, sse4); + assign_itx16_fn(R, 8, 16, sse4); + assign_itx16_fn(R, 16, 4, sse4); + assign_itx16_fn(R, 16, 8, sse4); + assign_itx12_fn(, 16, 16, sse4); + assign_itx2_fn (R, 8, 32, sse4); + assign_itx2_fn (R, 32, 8, sse4); + assign_itx2_fn (R, 16, 32, sse4); + assign_itx2_fn (R, 32, 16, sse4); + assign_itx2_fn (, 32, 32, sse4); + assign_itx1_fn (R, 16, 64, sse4); + assign_itx1_fn (R, 32, 64, sse4); + assign_itx1_fn (R, 64, 16, sse4); + assign_itx1_fn (R, 64, 32, sse4); + assign_itx1_fn (, 64, 64, sse4); + } +#endif + +#if ARCH_X86_64 + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; + + assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, avx2); + +#if BITDEPTH == 8 + assign_itx16_fn( , 4, 4, avx2); + assign_itx16_fn(R, 4, 8, avx2); + assign_itx16_fn(R, 4, 16, avx2); + assign_itx16_fn(R, 8, 4, avx2); + assign_itx16_fn( , 8, 8, avx2); + assign_itx16_fn(R, 8, 16, avx2); + assign_itx2_fn (R, 8, 32, avx2); + assign_itx16_fn(R, 16, 4, avx2); + assign_itx16_fn(R, 16, 8, avx2); + assign_itx12_fn( , 16, 16, avx2); + assign_itx2_fn (R, 16, 32, avx2); + assign_itx1_fn (R, 16, 64, avx2); + assign_itx2_fn (R, 32, 8, avx2); + assign_itx2_fn (R, 32, 16, avx2); + assign_itx2_fn ( , 32, 32, avx2); + assign_itx1_fn (R, 32, 64, avx2); + assign_itx1_fn (R, 64, 16, avx2); + 
assign_itx1_fn (R, 64, 32, avx2); + assign_itx1_fn ( , 64, 64, avx2); +#else + if (bpc == 10) { + assign_itx16_bpc_fn( , 4, 4, 10, avx2); + assign_itx16_bpc_fn(R, 4, 8, 10, avx2); + assign_itx16_bpc_fn(R, 4, 16, 10, avx2); + assign_itx16_bpc_fn(R, 8, 4, 10, avx2); + assign_itx16_bpc_fn( , 8, 8, 10, avx2); + assign_itx16_bpc_fn(R, 8, 16, 10, avx2); + assign_itx2_bpc_fn (R, 8, 32, 10, avx2); + assign_itx16_bpc_fn(R, 16, 4, 10, avx2); + assign_itx16_bpc_fn(R, 16, 8, 10, avx2); + assign_itx12_bpc_fn( , 16, 16, 10, avx2); + assign_itx2_bpc_fn (R, 16, 32, 10, avx2); + assign_itx1_bpc_fn (R, 16, 64, 10, avx2); + assign_itx2_bpc_fn (R, 32, 8, 10, avx2); + assign_itx2_bpc_fn (R, 32, 16, 10, avx2); + assign_itx2_bpc_fn ( , 32, 32, 10, avx2); + assign_itx1_bpc_fn (R, 32, 64, 10, avx2); + assign_itx1_bpc_fn (R, 64, 16, 10, avx2); + assign_itx1_bpc_fn (R, 64, 32, 10, avx2); + assign_itx1_bpc_fn ( , 64, 64, 10, avx2); + } else { + assign_itx16_bpc_fn( , 4, 4, 12, avx2); + assign_itx16_bpc_fn(R, 4, 8, 12, avx2); + assign_itx16_bpc_fn(R, 4, 16, 12, avx2); + assign_itx16_bpc_fn(R, 8, 4, 12, avx2); + assign_itx16_bpc_fn( , 8, 8, 12, avx2); + assign_itx16_bpc_fn(R, 8, 16, 12, avx2); + assign_itx2_bpc_fn (R, 8, 32, 12, avx2); + assign_itx16_bpc_fn(R, 16, 4, 12, avx2); + assign_itx16_bpc_fn(R, 16, 8, 12, avx2); + assign_itx12_bpc_fn( , 16, 16, 12, avx2); + assign_itx2_bpc_fn (R, 32, 8, 12, avx2); + assign_itx_bpc_fn(R, 16, 32, identity_identity, IDTX, 12, avx2); + assign_itx_bpc_fn(R, 32, 16, identity_identity, IDTX, 12, avx2); + assign_itx_bpc_fn( , 32, 32, identity_identity, IDTX, 12, avx2); + } +#endif + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + +#if BITDEPTH == 8 + assign_itx16_fn( , 4, 4, avx512icl); // no wht + assign_itx16_fn(R, 4, 8, avx512icl); + assign_itx16_fn(R, 4, 16, avx512icl); + assign_itx16_fn(R, 8, 4, avx512icl); + assign_itx16_fn( , 8, 8, avx512icl); + assign_itx16_fn(R, 8, 16, avx512icl); + assign_itx2_fn (R, 8, 32, avx512icl); + assign_itx16_fn(R, 16, 4, avx512icl); + assign_itx16_fn(R, 16, 8, avx512icl); + assign_itx12_fn( , 16, 16, avx512icl); + assign_itx2_fn (R, 16, 32, avx512icl); + assign_itx1_fn (R, 16, 64, avx512icl); + assign_itx2_fn (R, 32, 8, avx512icl); + assign_itx2_fn (R, 32, 16, avx512icl); + assign_itx2_fn ( , 32, 32, avx512icl); + assign_itx1_fn (R, 32, 64, avx512icl); + assign_itx1_fn (R, 64, 16, avx512icl); + assign_itx1_fn (R, 64, 32, avx512icl); + assign_itx1_fn ( , 64, 64, avx512icl); +#else + if (bpc == 10) { + assign_itx16_bpc_fn( , 8, 8, 10, avx512icl); + assign_itx16_bpc_fn(R, 8, 16, 10, avx512icl); + assign_itx2_bpc_fn (R, 8, 32, 10, avx512icl); + assign_itx16_bpc_fn(R, 16, 8, 10, avx512icl); + assign_itx12_bpc_fn( , 16, 16, 10, avx512icl); + assign_itx2_bpc_fn (R, 16, 32, 10, avx512icl); + assign_itx2_bpc_fn (R, 32, 8, 10, avx512icl); + assign_itx2_bpc_fn (R, 32, 16, 10, avx512icl); + assign_itx2_bpc_fn ( , 32, 32, 10, avx512icl); + assign_itx1_bpc_fn (R, 16, 64, 10, avx512icl); + } +#endif +#endif +} diff --git a/third_party/dav1d/src/x86/itx16_avx2.asm b/third_party/dav1d/src/x86/itx16_avx2.asm new file mode 100644 index 0000000000..2315ec1e47 --- /dev/null +++ b/third_party/dav1d/src/x86/itx16_avx2.asm @@ -0,0 +1,8599 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; Copyright © 2021, Matthias Dressel +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. 
Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 +itx4_shuf: dd 0x50401600, 0xd0c09284, 0x70603422, 0xf0e0b0a6 + dd 0x50401701, 0xd0c09385, 0x70603523, 0xf0e0b1a7 +idct4_12_shuf: dd 0, 2, 4, 6, 1, 3, 5, 7 +idct4_12_shuf2: dd 2, 0, 6, 4, 3, 1, 7, 5 +iadst8_12_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7 +idct16_12_shuf: dd 0, 4, 1, 5, 3, 7, 2, 6 +iadst16_12_shuf: dd 3, 7, 0, 4, 2, 6, 1, 5 +pw_2048_m2048: dw 2048, 2048, 2048, 2048, -2048, -2048, -2048, -2048 +idct4_shuf: db 0, 1, 4, 5, 12, 13, 8, 9, 2, 3, 6, 7, 14, 15, 10, 11 +idct32_shuf: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15 + +%macro COEF_PAIR 2-3 0 +pd_%1_%2: dd %1, %1, %2, %2 +%define pd_%1 (pd_%1_%2 + 4*0) +%define pd_%2 (pd_%1_%2 + 4*2) +%if %3 +dd -%2, -%2 +%define pd_%2_m%2 pd_%2 +%endif +%endmacro + +COEF_PAIR 201, 995 +COEF_PAIR 401, 1931 +COEF_PAIR 799, 3406 +COEF_PAIR 1380, 601 +COEF_PAIR 1751, 2440 +COEF_PAIR 2598, 1189 +COEF_PAIR 2751, 2106 +COEF_PAIR 2896, 1567, 1 +COEF_PAIR 2896, 3784, 1 +COEF_PAIR 3035, 3513 +COEF_PAIR 3166, 3920 +COEF_PAIR 3703, 3290 +COEF_PAIR 3857, 4052 +COEF_PAIR 4017, 2276 +COEF_PAIR 4076, 3612 +COEF_PAIR 4091, 3973 + +pd_8: dd 8 +pd_m601: dd -601 +pd_m1189: dd -1189 +pd_m1380: dd -1380 +pd_m2106: dd -2106 +pd_m2598: dd -2598 +pd_m2751: dd -2751 +pd_m3344: dd -3344 +pd_1024: dd 1024 +pd_1321: dd 1321 +pd_1448: dd 1448 +pd_1697: dd 1697 +pd_2482: dd 2482 +pd_3072: dd 3072 ; 1024 + 2048 +pd_3803: dd 3803 +pd_5119: dd 5119 ; 1024 + 4096 - 1 +pd_5120: dd 5120 ; 1024 + 4096 +pd_5793: dd 5793 +pd_6144: dd 6144 ; 2048 + 4096 +pd_17408: dd 17408 ; 1024 + 16384 + +pixel_10bpc_max: times 2 dw 0x03ff +pixel_12bpc_max: times 2 dw 0x0fff +dconly_10bpc: times 2 dw 0x7c00 +dconly_12bpc: times 2 dw 0x7000 +clip_18b_min: dd -0x20000 +clip_18b_max: dd 0x1ffff +clip_20b_min: dd -0x80000 +clip_20b_max: dd 0x7ffff + +idct64_mul_16bpc: +dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017 +dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799 +dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276 +dd -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889, -1189, -3920, -2276, -3406 + +cextern deint_shuf +cextern idct64_mul +cextern pw_1697x8 +cextern pw_1697x16 +cextern pw_1567_3784 +cextern pw_m1567_m3784 +cextern pw_m3784_1567 +cextern pw_2896_2896 +cextern pw_m2896_2896 +cextern pw_5 +cextern pw_2048 
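+; A note on the COEF_PAIR constants above (illustrative expansion, not part
+; of the upstream file): "COEF_PAIR 2896, 1567, 1" emits
+;     pd_2896_1567: dd 2896, 2896, 1567, 1567
+;                   dd -1567, -1567
+; and defines pd_2896 as pd_2896_1567+0 and pd_1567 as pd_2896_1567+8, with
+; pd_1567_m1567 aliased to pd_1567. Storing each pair once means a single
+; vbroadcasti128 from the pair label yields {2896, 2896, 1567, 1567} in each
+; 128-bit lane (the layout the packed variant of ITX_MULSUB_2D below expects),
+; while broadcasting from pd_1567 yields {1567, 1567, -1567, -1567}.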
+cextern pw_4096 +cextern pw_8192 +cextern pw_16384 +cextern pw_2896x8 +cextern pd_2048 + +cextern idct_4x8_internal_8bpc_avx2.main +cextern idct_4x16_internal_8bpc_avx2.main +cextern idct_8x8_internal_8bpc_avx2.main +cextern idct_8x16_internal_8bpc_avx2.main +cextern idct_16x4_internal_8bpc_avx2.main +cextern idct_16x8_internal_8bpc_avx2.main +cextern idct_16x16_internal_8bpc_avx2.main +cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main +cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main_fast +cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf +cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf_fast +cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part1 +cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part2_internal + +cextern iadst_4x4_internal_8bpc_avx2.main +cextern iadst_4x8_internal_8bpc_avx2.main_pass2 +cextern iadst_4x16_internal_8bpc_avx2.main2 +cextern iadst_8x4_internal_8bpc_avx2.main +cextern iadst_8x8_internal_8bpc_avx2.main_pass2 +cextern iadst_8x16_internal_8bpc_avx2.main +cextern iadst_8x16_internal_8bpc_avx2.main_pass2_end +cextern iadst_16x4_internal_8bpc_avx2.main +cextern iadst_16x8_internal_8bpc_avx2.main +cextern iadst_16x8_internal_8bpc_avx2.main_pass2_end +cextern iadst_16x16_internal_8bpc_avx2.main +cextern iadst_16x16_internal_8bpc_avx2.main_pass2_end + +SECTION .text + +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +%macro WRAP_XMM 1+ + INIT_XMM cpuname + %1 + INIT_YMM cpuname +%endmacro + +%macro IWHT4_1D_PACKED 0 + ; m0 = in0 in2, m1 = in1 in3 + psubd m2, m0, m1 ; t2 + paddd xm0, xm1 ; t0 + vpermq m2, m2, q3322 + vpermq m0, m0, q1100 + vpermq m1, m1, q3120 + psubd m3, m0, m2 + psrad m3, 1 + psubd m3, m1 ; t1 t3 + psubd m0, m3 ; ____ out0 + paddd m2, m3 ; out3 ____ +%endmacro + +INIT_YMM avx2 +cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax + mova xm0, [cq+16*0] + vinserti128 m0, [cq+16*2], 1 + mova xm1, [cq+16*1] + vinserti128 m1, [cq+16*3], 1 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + lea r6, [dstq+strideq*2] + psrad m0, 2 + psrad m1, 2 + IWHT4_1D_PACKED + punpckhdq m0, m3 + punpckldq m3, m2 + punpckhqdq m1, m0, m3 + punpcklqdq m0, m3 + IWHT4_1D_PACKED + vpblendd m0, m2, 0x33 + packssdw m0, m3 + vextracti128 xm2, m0, 1 + punpckhdq xm1, xm0, xm2 ; out2 out1 + punpckldq xm0, xm2 ; out3 out0 + movq xm2, [r6 +strideq*1] + movhps xm2, [dstq+strideq*0] + movq xm3, [r6 +strideq*0] + movhps xm3, [dstq+strideq*1] +%ifidn bdmaxd, bdmaxm + movd xm5, bdmaxd + vpbroadcastw xm5, xm5 +%else ; win64: load from stack + vpbroadcastw xm5, bdmaxm +%endif + paddsw xm0, xm2 + paddsw xm1, xm3 + pmaxsw xm0, xm4 + pmaxsw xm1, xm4 + pminsw xm0, xm5 + pminsw xm1, xm5 + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm1 + movq [r6 +strideq*0], xm1 + movq [r6 +strideq*1], xm0 + RET + +; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 +; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 +; flags: 1 = packed, 2 = inv_dst2 +; skip round/shift if rnd is not a number +%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags +%if %8 < 32 + pmulld m%4, m%1, m%8 + pmulld m%3, m%2, m%8 +%else +%if %9 & 1 + vbroadcasti128 m%3, [pd_%8] +%else + vpbroadcastd m%3, [pd_%8] +%endif + pmulld m%4, m%1, m%3 + pmulld m%3, m%2 +%endif +%if %7 < 32 + pmulld m%1, m%7 + pmulld m%2, m%7 +%else +%if %9 & 1 + vbroadcasti128 m%5, [pd_%7] +%else + vpbroadcastd m%5, [pd_%7] +%endif + pmulld m%1, m%5 + pmulld m%2, m%5 +%endif +%if %9 & 2 + psubd m%4, m%6, m%4 + psubd m%2, m%4, m%2 +%else +%ifnum %6 + paddd m%4, m%6 +%endif 
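+; Illustrative scalar model of this path (not upstream code): with flag bit 2
+; clear and a numeric rnd such as pd_2048, the macro computes, per element,
+;     dst1 = (src1*coef1 - src2*coef2 + 2048) >> 12
+;     dst2 = (src1*coef2 + src2*coef1 + 2048) >> 12
+; i.e. a butterfly rotation with round-to-nearest (2048 = 4096/2). At this
+; point m%4 already holds src1*coef2 plus the rounding constant, so the
+; paddd below forms the unshifted dst2.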
+ paddd m%2, m%4 +%endif +%ifnum %6 + paddd m%1, m%6 +%endif + psubd m%1, m%3 +%ifnum %6 + psrad m%2, 12 + psrad m%1, 12 +%endif +%endmacro + +%macro INV_TXFM_FN 4-5 10 ; type1, type2, eob_offset, size, bitdepth +cglobal inv_txfm_add_%1_%2_%4_%5bpc, 4, 5, 0, dst, stride, c, eob, tx2 + %define %%p1 m(i%1_%4_internal_%5bpc) + ; Jump to the 1st txfm function if we're not taking the fast path, which + ; in turn performs an indirect jump to the 2nd txfm function. + lea tx2q, [m(i%2_%4_internal_%5bpc).pass2] +%ifidn %1_%2, dct_dct + test eobd, eobd + jnz %%p1 +%else +%if %3 + add eobd, %3 +%endif + ; jump to the 1st txfm function unless it's located directly after this + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endmacro + +%macro INV_TXFM_4X4_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 4x4, %3 +%ifidn %1_%2, dct_dct + vpbroadcastd xm2, [dconly_%3bpc] +%if %3 = 10 +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 4 +.dconly2: + add r6d, 128 + sar r6d, 8 +.dconly3: + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 + movd xm0, r6d + paddsw xm0, xm2 + vpbroadcastw xm0, xm0 +.dconly_loop: + movq xm1, [dstq+strideq*0] + movhps xm1, [dstq+strideq*1] + paddsw xm1, xm0 + psubusw xm1, xm2 + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + WRAP_XMM RET +%else + jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly +%endif +%endif +%endmacro + +%macro IDCT4_1D_PACKED 6 ; dst/src[1-2], tmp[1-3], rnd + ITX_MULSUB_2D %1, %2, %3, %4, %5, %6, 2896_1567, 2896_3784, 1 + punpckhqdq m%3, m%2, m%1 ; t3 t2 + punpcklqdq m%2, m%1 ; t0 t1 + paddd m%1, m%2, m%3 ; out0 out1 + psubd m%2, m%3 ; out3 out2 +%endmacro + +%macro IDCT4_1D_PACKED_WORD 6 ; dst/src[1-2], tmp[1-3], rnd + vpbroadcastd m%5, [pw_m3784_1567] + punpckhwd m%3, m%2, m%1 + vpbroadcastd m%4, [pw_1567_3784] + punpcklwd m%2, m%1 + vpbroadcastd m%1, [pw_m2896_2896] + pmaddwd m%5, m%3 + pmaddwd m%3, m%4 + vpbroadcastd m%4, [pw_2896_2896] + pmaddwd m%1, m%2 + pmaddwd m%2, m%4 + REPX {paddd x, m%6}, m%5, m%3, m%1, m%2 + REPX {psrad x, 12 }, m%5, m%3, m%1, m%2 + packssdw m%3, m%5 ; t3 t2 + packssdw m%2, m%1 ; t0 t1 + paddsw m%1, m%2, m%3 ; out0 out1 + psubsw m%2, m%3 ; out3 out2 +%endmacro + +INV_TXFM_4X4_FN dct, dct +INV_TXFM_4X4_FN dct, identity +INV_TXFM_4X4_FN dct, adst +INV_TXFM_4X4_FN dct, flipadst + +cglobal idct_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 + call .main + vbroadcasti128 m2, [idct4_shuf] + packssdw m0, m1 + pshufb m0, m2 + jmp tx2q +.pass2: + vextracti128 xm1, m0, 1 + WRAP_XMM IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 5 + packssdw xm5, xm5 ; pw_2048 + pmulhrsw xm0, xm5 + pmulhrsw xm1, xm5 + movq xm2, [dstq+strideq*0] + movhps xm2, [dstq+strideq*1] + lea r6, [dstq+strideq*2] + movq xm3, [r6 +strideq*1] + movhps xm3, [r6 +strideq*0] + vpbroadcastd xm5, [pixel_10bpc_max] + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + paddw xm0, xm2 + paddw xm1, xm3 + pmaxsw xm0, xm4 + pmaxsw xm1, xm4 + pminsw xm0, xm5 + pminsw xm1, xm5 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movhps [r6 +strideq*0], xm1 + movq [r6 +strideq*1], xm1 + RET +ALIGN function_align +.main: + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m5, [pd_2048] +.main2: + IDCT4_1D_PACKED 0, 1, 2, 3, 4, 5 + ret + +INV_TXFM_4X4_FN adst, dct +INV_TXFM_4X4_FN adst, adst +INV_TXFM_4X4_FN adst, flipadst +INV_TXFM_4X4_FN adst, identity + +%macro IADST4_1D 0 + vpbroadcastd m5, [pd_1321] + vpbroadcastd m7, [pd_2482] + 
pmulld m4, m0, m5 ; 1321*in0 + pmulld m6, m3, m7 ; 2482*in3 + paddd m4, m6 ; 1321*in0 + 2482*in3 + pmulld m6, m0, m7 ; 2482*in0 + paddd m0, m3 ; in0 + in3 + paddd m7, m5 ; pd_3803 + pmulld m5, m2 ; 1321*in2 + pmulld m3, m7 ; 3803*in3 + pmulld m7, m2 ; 3803*in2 + psubd m2, m0 ; in2 - in0 - in3 + vpbroadcastd m0, [pd_m3344] + pmulld m1, m0 ; -t3 + pmulld m2, m0 ; out2 (unrounded) + psubd m6, m5 ; 2482*in0 - 1321*in2 + paddd m4, m7 ; t0 + psubd m6, m3 ; t1 + paddd m3, m4, m6 + psubd m4, m1 ; out0 (unrounded) + psubd m6, m1 ; out1 (unrounded) + paddd m3, m1 ; out3 (unrounded) +%endmacro + +cglobal iadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 + call .main + vinserti128 m0, m4, xm6, 1 + vinserti128 m1, m2, xm3, 1 +.pass1_end: + vpbroadcastd m5, [pd_2048] + mova m2, [itx4_shuf] + paddd m0, m5 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 + packssdw m0, m1 + vpermd m0, m2, m0 + psrld m2, 4 + pshufb m0, m2 +%if WIN64 + movaps xmm6, [rsp+ 8] + movaps xmm7, [rsp+24] +%endif + jmp tx2q +.pass2: + lea r6, [deint_shuf+128] + vextracti128 xm1, m0, 1 + call m(iadst_4x4_internal_8bpc).main +.end: + vpbroadcastd xm4, [pw_2048] + movq xm2, [dstq+strideq*0] + movhps xm2, [dstq+strideq*1] + lea r6, [dstq+strideq*2] + movq xm3, [r6 +strideq*0] + movhps xm3, [r6 +strideq*1] + vpbroadcastd xm5, [pixel_10bpc_max] + pmulhrsw xm0, xm4 + pmulhrsw xm1, xm4 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + paddw xm0, xm2 + paddw xm1, xm3 + pmaxsw xm0, xm4 + pmaxsw xm1, xm4 + pminsw xm0, xm5 + pminsw xm1, xm5 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [r6 +strideq*0], xm1 + movhps [r6 +strideq*1], xm1 + RET +ALIGN function_align +.main: + mova xm0, [cq+16*0] + mova xm1, [cq+16*1] + mova xm2, [cq+16*2] + mova xm3, [cq+16*3] +%if WIN64 + movaps [rsp+16], xmm6 + movaps [rsp+32], xmm7 +%endif +.main2: + WRAP_XMM IADST4_1D + ret + +INV_TXFM_4X4_FN flipadst, dct +INV_TXFM_4X4_FN flipadst, adst +INV_TXFM_4X4_FN flipadst, flipadst +INV_TXFM_4X4_FN flipadst, identity + +cglobal iflipadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 + call m(iadst_4x4_internal_10bpc).main + vinserti128 m0, m3, xm2, 1 + vinserti128 m1, m6, xm4, 1 + jmp m(iadst_4x4_internal_10bpc).pass1_end +.pass2: + lea r6, [deint_shuf+128] + vextracti128 xm1, m0, 1 + call m(iadst_4x4_internal_8bpc).main + vpbroadcastd xm4, [pw_2048] + movq xm3, [dstq+strideq*1] + movhps xm3, [dstq+strideq*0] + lea r6, [dstq+strideq*2] + movq xm2, [r6 +strideq*1] + movhps xm2, [r6 +strideq*0] + vpbroadcastd xm5, [pixel_10bpc_max] + pmulhrsw xm0, xm4 + pmulhrsw xm1, xm4 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + paddw xm0, xm2 + paddw xm1, xm3 + pmaxsw xm0, xm4 + pmaxsw xm1, xm4 + pminsw xm0, xm5 + pminsw xm1, xm5 + movhps [dstq+strideq*0], xm1 + movq [dstq+strideq*1], xm1 + movhps [r6 +strideq*0], xm0 + movq [r6 +strideq*1], xm0 + RET + +INV_TXFM_4X4_FN identity, dct +INV_TXFM_4X4_FN identity, adst +INV_TXFM_4X4_FN identity, flipadst +INV_TXFM_4X4_FN identity, identity + +cglobal iidentity_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 + vpbroadcastd m1, [pd_5793] + pmulld m0, m1, [cq+32*0] + pmulld m1, [cq+32*1] + vpbroadcastd m5, [pd_2048] + mova m3, [itx4_shuf] + paddd m0, m5 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 + packssdw m0, m1 + vpermd m0, m3, m0 + psrld m3, 4 + pshufb m0, m3 + jmp tx2q +.pass2: + vpbroadcastd m1, [pw_1697x8] + movq xm2, [dstq+strideq*0] + movhps xm2, [dstq+strideq*1] + lea r6, [dstq+strideq*2] + pmulhrsw m1, m0 + paddsw m0, m1 + movq xm3, [r6 +strideq*0] + movhps xm3, [r6 
+strideq*1] + vpbroadcastd xm4, [pixel_10bpc_max] + packssdw m5, m5 ; pw_2048 + pmulhrsw m0, m5 + pxor m5, m5 + mova [cq+32*0], m5 + mova [cq+32*1], m5 + vextracti128 xm1, m0, 1 + paddw xm0, xm2 + paddw xm1, xm3 + pmaxsw xm0, xm5 + pmaxsw xm1, xm5 + pminsw xm0, xm4 + pminsw xm1, xm4 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [r6 +strideq*0], xm1 + movhps [r6 +strideq*1], xm1 + RET + +INV_TXFM_4X4_FN dct, dct, 12 +INV_TXFM_4X4_FN dct, identity, 12 +INV_TXFM_4X4_FN dct, adst, 12 +INV_TXFM_4X4_FN dct, flipadst, 12 + +cglobal idct_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 + call m(idct_4x4_internal_10bpc).main + mova m3, [idct4_12_shuf] + mova m4, [idct4_12_shuf2] + vpermd m2, m4, m1 + vpermd m1, m3, m0 + jmp m(iadst_4x4_internal_12bpc).pass1_end2 +.pass2: + vpbroadcastd m5, [pd_2048] + vpermq m0, m0, q3120 + vpermq m1, m1, q3120 + call m(idct_4x4_internal_10bpc).main2 + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + jmp m(iadst_4x4_internal_12bpc).end + +INV_TXFM_4X4_FN adst, dct, 12 +INV_TXFM_4X4_FN adst, adst, 12 +INV_TXFM_4X4_FN adst, flipadst, 12 +INV_TXFM_4X4_FN adst, identity, 12 + +cglobal iadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 + call m(iadst_4x4_internal_10bpc).main + vinserti128 m1, m4, xm6, 1 + vinserti128 m2, xm3, 1 +.pass1_end: + mova m3, [itx4_shuf] + vpbroadcastd m5, [pd_1024] + psrad m1, 1 + psrad m2, 1 + vpermd m1, m3, m1 + vpermd m2, m3, m2 + paddd m1, m5 + paddd m2, m5 + psrad m1, 11 + psrad m2, 11 +.pass1_end2: + vpbroadcastd m3, [clip_18b_min] + vpbroadcastd m4, [clip_18b_max] + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 + pmaxsd m0, m3 + pmaxsd m1, m3 + pminsd m0, m4 + pminsd m1, m4 + jmp tx2q +.pass2: + call .main_pass2 + vinserti128 m0, m4, xm6, 1 + vinserti128 m1, m2, xm3, 1 +.pass2_end: + vpbroadcastd m5, [pd_2048] + paddd m0, m5 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 +.end: +%if WIN64 + WIN64_RESTORE_XMM_INTERNAL + %assign xmm_regs_used 6 +%endif +.end2: + vpbroadcastd m4, [pw_16384] + movq xm2, [dstq+strideq*0] + movq xm3, [dstq+strideq*1] + lea r6, [dstq+strideq*2] + movhps xm2, [r6 +strideq*0] ; dst0 dst2 + movhps xm3, [r6 +strideq*1] ; dst1 dst3 + vpbroadcastd m5, [pixel_12bpc_max] + vinserti128 m2, xm3, 1 + psrad m0, 3 + psrad m1, 3 + packssdw m0, m1 ; t0 t2 t1 t3 + pmulhrsw m0, m4 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + paddw m0, m2 ; out0 out2 out1 out3 + pmaxsw m0, m4 + pminsw m0, m5 + vextracti128 xm1, m0, 1 ; out1 out3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [r6 +strideq*0], xm0 + movhps [r6 +strideq*1], xm1 + RET +.main_pass2: + vextracti128 xm3, m1, 1 + mova xm2, xm1 + vextracti128 xm1, m0, 1 + jmp m(iadst_4x4_internal_10bpc).main2 + +INV_TXFM_4X4_FN flipadst, dct, 12 +INV_TXFM_4X4_FN flipadst, adst, 12 +INV_TXFM_4X4_FN flipadst, flipadst, 12 +INV_TXFM_4X4_FN flipadst, identity, 12 + +cglobal iflipadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 + call m(iadst_4x4_internal_10bpc).main + vinserti128 m1, m3, xm2, 1 + vinserti128 m2, m6, xm4, 1 + jmp m(iadst_4x4_internal_12bpc).pass1_end +.pass2: + call m(iadst_4x4_internal_12bpc).main_pass2 + vinserti128 m0, m3, xm2, 1 + vinserti128 m1, m6, xm4, 1 + jmp m(iadst_4x4_internal_12bpc).pass2_end + +INV_TXFM_4X4_FN identity, dct, 12 +INV_TXFM_4X4_FN identity, adst, 12 +INV_TXFM_4X4_FN identity, flipadst, 12 +INV_TXFM_4X4_FN identity, identity, 12 + +cglobal iidentity_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 + mova m2, [itx4_shuf] + vpbroadcastd m3, [pd_1697] + vpermd m0, m2, [cq+32*0] + 
vpermd m2, m2, [cq+32*1] + vpbroadcastd m5, [pd_2048] + pmulld m1, m3, m0 + pmulld m3, m2 + paddd m1, m5 + paddd m3, m5 + psrad m1, 12 + psrad m3, 12 + paddd m1, m0 + paddd m2, m3 + jmp m(iadst_4x4_internal_12bpc).pass1_end2 +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 + vpbroadcastd m3, [pd_5793] + vpbroadcastd m5, [pd_2048] + pmulld m0, m3 + pmulld m1, m3 + paddd m0, m5 ; 2048 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 + jmp m(iadst_4x4_internal_12bpc).end + +%macro INV_TXFM_4X8_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 4x8, %3 +%ifidn %1_%2, dct_dct + vpbroadcastd xm2, [dconly_%3bpc] +%if %3 = 10 +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 8 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly2 +%else + jmp m(inv_txfm_add_dct_dct_4x8_10bpc).dconly +%endif +%endif +%endmacro + +%macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd + ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; t2, t3 + vpbroadcastd m%5, [pd_2896] + pmulld m%1, m%5 + pmulld m%3, m%5 + paddd m%1, m%8 + paddd m%5, m%1, m%3 + psubd m%1, m%3 + psrad m%5, 12 ; t0 + psrad m%1, 12 ; t1 + psubd m%3, m%1, m%2 + paddd m%2, m%1 + paddd m%1, m%5, m%4 + psubd m%4, m%5, m%4 +%endmacro + +INV_TXFM_4X8_FN dct, dct +INV_TXFM_4X8_FN dct, identity +INV_TXFM_4X8_FN dct, adst +INV_TXFM_4X8_FN dct, flipadst + +cglobal idct_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 +.pass1: + vpbroadcastd m3, [pd_2896] + pmulld m0, m3, [cq+32*0] + pmulld m1, m3, [cq+32*1] + pmulld m2, m3, [cq+32*2] + pmulld m3, m3, [cq+32*3] + vpbroadcastd m7, [pd_2048] + REPX {paddd x, m7}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + IDCT4_1D 0, 1, 2, 3, 4, 5, 6, 7 + jmp tx2q +.pass2: + packssdw m0, m2 + packssdw m1, m3 + lea r6, [deint_shuf+128] + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhdq m1, m0, m2 ; 2 3 + punpckldq m0, m2 ; 0 1 + vextracti128 xm2, m0, 1 ; 4 5 + vextracti128 xm3, m1, 1 ; 6 7 + call m(idct_4x8_internal_8bpc).main + vpbroadcastd xm4, [pw_2048] + REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3 + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+strideq*1] + movq xm5, [dstq+r3 ] + movhps xm5, [dstq+strideq*2] + movq xm6, [r6 +strideq*0] + movhps xm6, [r6 +strideq*1] + movq xm7, [r6 +r3 ] + movhps xm7, [r6 +strideq*2] + paddw xm0, xm4 ; 0 1 + paddw xm1, xm5 ; 3 2 + paddw xm2, xm6 ; 4 5 + paddw xm3, xm7 ; 7 6 + vpbroadcastd xm5, [pixel_10bpc_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3 + REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movhps [dstq+strideq*2], xm1 + movq [dstq+r3 ], xm1 + movq [r6 +strideq*0], xm2 + movhps [r6 +strideq*1], xm2 + movhps [r6 +strideq*2], xm3 + movq [r6 +r3 ], xm3 + RET + +INV_TXFM_4X8_FN adst, dct +INV_TXFM_4X8_FN adst, adst +INV_TXFM_4X8_FN adst, flipadst +INV_TXFM_4X8_FN adst, identity + +cglobal iadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 + call m(iadst_8x4_internal_10bpc).main + vpbroadcastd m5, [pd_2048] + paddd m0, m5, m4 + paddd m1, m5, m6 + paddd m2, m5 + paddd m3, m5 +.pass1_end: + REPX {psrad x, 12}, m0, m1, m2, m3 + jmp tx2q +.pass2: + call .pass2_main + mova xm4, [pw_2048_m2048] + REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3 +.end: + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+strideq*1] + movq xm5, [dstq+strideq*2] + movhps xm5, [dstq+r3 ] + movq xm6, [r6 +strideq*0] + movhps xm6, [r6 +strideq*1] + movq 
xm7, [r6 +strideq*2] + movhps xm7, [r6 +r3 ] + paddw xm0, xm4 ; 0 1 + paddw xm1, xm5 ; 2 3 + paddw xm2, xm6 ; 4 5 + paddw xm3, xm7 ; 6 7 + vpbroadcastd xm5, [pixel_10bpc_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3 + REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r3 ], xm1 + movq [r6 +strideq*0], xm2 + movhps [r6 +strideq*1], xm2 + movq [r6 +strideq*2], xm3 + movhps [r6 +r3 ], xm3 + RET +ALIGN function_align +.pass2_main: + packssdw m0, m2 + packssdw m1, m3 + lea r6, [deint_shuf+128] + punpcklwd m4, m0, m1 + punpckhwd m0, m1 + punpckhdq m5, m4, m0 + punpckldq m4, m0 + vextracti128 xm2, m4, 1 ; 4 5 + vextracti128 xm3, m5, 1 ; 6 7 + pshufd xm4, xm4, q1032 ; 1 0 + pshufd xm5, xm5, q1032 ; 3 2 + jmp m(iadst_4x8_internal_8bpc).main_pass2 +ALIGN function_align +.main: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] +.main2: + vbroadcasti128 m0, [cq+16*0] + vbroadcasti128 m2, [cq+16*2] + vbroadcasti128 m3, [cq+16*5] + vbroadcasti128 m1, [cq+16*7] + vpbroadcastd m6, [pd_2896] + shufpd m0, m2, 0x0c ; 0 2 + shufpd m1, m3, 0x0c ; 7 5 + vbroadcasti128 m2, [cq+16*4] + vbroadcasti128 m4, [cq+16*6] + vbroadcasti128 m5, [cq+16*1] + vbroadcasti128 m3, [cq+16*3] + vpbroadcastd m7, [pd_2048] + shufpd m2, m4, 0x0c ; 4 6 + shufpd m3, m5, 0x0c ; 3 1 + REPX {pmulld x, m6}, m0, m1, m2, m3 + REPX {paddd x, m7}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 +.main3: + ITX_MULSUB_2D 1, 0, 4, 5, 6, 7, 401_1931, 4076_3612, 1 + ITX_MULSUB_2D 3, 2, 4, 5, 6, 7, 3166_3920, 2598_1189, 1 + psubd m4, m0, m2 ; t4 t6 + paddd m0, m2 ; t0 t2 + psubd m2, m1, m3 ; t5 t7 + paddd m1, m3 ; t1 t3 + REPX {pmaxsd x, m8}, m4, m2, m0, m1 + REPX {pminsd x, m9}, m4, m2, m0, m1 + pxor m5, m5 + psubd m5, m4 + vpblendd m4, m2, 0xcc ; t4 t7 + vpblendd m2, m5, 0xcc ; t5 -t6 + ITX_MULSUB_2D 4, 2, 3, 5, 6, 7, 1567, 3784 + vpbroadcastd m5, [pd_2896] + vbroadcasti128 m6, [pw_2048_m2048] ; + + - - + punpckhqdq m3, m0, m1 + punpcklqdq m0, m1 + psubd m1, m0, m3 ; t2 t3 + paddd m0, m3 ; out0 -out7 + punpckhqdq m3, m4, m2 ; t7a t6a + punpcklqdq m4, m2 ; t5a t4a + psubd m2, m4, m3 ; t7 t6 + paddd m4, m3 ; out6 -out1 + REPX {pmaxsd x, m8}, m1, m2 + REPX {pminsd x, m9}, m1, m2 + vpblendd m3, m1, m2, 0xcc + shufpd m1, m2, 0x05 + pmulld m3, m5 + pmulld m5, m1 + psignd m0, m6 ; out0 out7 + psignd m4, m6 ; out6 out1 + paddd m3, m7 + psubd m2, m3, m5 + paddd m5, m3 + psrad m2, 12 ; out4 -out5 + psrad m5, 12 ; -out3 out2 + ret + +INV_TXFM_4X8_FN flipadst, dct +INV_TXFM_4X8_FN flipadst, adst +INV_TXFM_4X8_FN flipadst, flipadst +INV_TXFM_4X8_FN flipadst, identity + +cglobal iflipadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 + call m(iadst_8x4_internal_10bpc).main + vpbroadcastd m5, [pd_2048] + paddd m0, m5, m3 + paddd m1, m5, m2 + paddd m2, m5, m6 + paddd m3, m5, m4 + jmp m(iadst_4x8_internal_10bpc).pass1_end +.pass2: + call m(iadst_4x8_internal_10bpc).pass2_main + mova xm4, [pw_2048_m2048] + REPX {pmulhrsw x, xm4}, xm3, xm2, xm1, xm0 + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm4, [dstq+strideq*1] + movhps xm4, [dstq+strideq*0] + movq xm5, [dstq+r3 ] + movhps xm5, [dstq+strideq*2] + movq xm6, [r6 +strideq*1] + movhps xm6, [r6 +strideq*0] + movq xm7, [r6 +r3 ] + movhps xm7, [r6 +strideq*2] + paddw xm3, xm4 ; 1 0 + paddw xm2, xm5 ; 3 2 + paddw xm1, xm6 ; 5 4 + paddw xm0, xm7 ; 7 6 + vpbroadcastd xm5, [pixel_10bpc_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 
1, 2, 3 + REPX {pmaxsw x, xm4}, xm3, xm2, xm1, xm0 + REPX {pminsw x, xm5}, xm3, xm2, xm1, xm0 + movhps [dstq+strideq*0], xm3 + movq [dstq+strideq*1], xm3 + movhps [dstq+strideq*2], xm2 + movq [dstq+r3 ], xm2 + movhps [r6 +strideq*0], xm1 + movq [r6 +strideq*1], xm1 + movhps [r6 +strideq*2], xm0 + movq [r6 +r3 ], xm0 + RET + +INV_TXFM_4X8_FN identity, dct +INV_TXFM_4X8_FN identity, adst +INV_TXFM_4X8_FN identity, flipadst +INV_TXFM_4X8_FN identity, identity + +cglobal iidentity_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 +.pass1: + vpbroadcastd m3, [pd_2896] + pmulld m0, m3, [cq+32*0] + pmulld m1, m3, [cq+32*1] + pmulld m2, m3, [cq+32*2] + pmulld m3, [cq+32*3] + vpbroadcastd m5, [pd_2048] + vpbroadcastd m4, [pd_5793] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + REPX {pmulld x, m4}, m0, m1, m2, m3 + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + jmp tx2q +.pass2: + vpbroadcastd m6, [pixel_10bpc_max] + call .pass2_end + RET +ALIGN function_align +.pass2_end: + vpbroadcastd m4, [pw_4096] + packssdw m0, m2 + packssdw m1, m3 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pmulhrsw m2, m4 + pmulhrsw m0, m4 + punpckhdq m1, m0, m2 ; 2 3 6 7 + punpckldq m0, m2 ; 0 1 4 5 + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm2, [dstq+strideq*0] + movhps xm2, [dstq+strideq*1] + vpbroadcastq m4, [r6 +strideq*0] + vpbroadcastq m5, [r6 +strideq*1] + movq xm3, [dstq+strideq*2] + movhps xm3, [dstq+r3 ] + vpblendd m2, m4, 0x30 + vpblendd m2, m5, 0xc0 + vpbroadcastq m4, [r6 +strideq*2] + vpbroadcastq m5, [r6 +r3 ] + vpblendd m3, m4, 0x30 + vpblendd m3, m5, 0xc0 + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + paddw m0, m2 ; out0 out1 out4 out5 + paddw m1, m3 ; out2 out3 out6 out7 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m6 + pminsw m1, m6 + vextracti128 xm2, m0, 1 ; out4 out5 + vextracti128 xm3, m1, 1 ; out6 out7 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r3 ], xm1 + movq [r6 +strideq*0], xm2 + movhps [r6 +strideq*1], xm2 + movq [r6 +strideq*2], xm3 + movhps [r6 +r3 ], xm3 + ret + +INV_TXFM_4X8_FN dct, dct, 12 +INV_TXFM_4X8_FN dct, identity, 12 +INV_TXFM_4X8_FN dct, adst, 12 +INV_TXFM_4X8_FN dct, flipadst, 12 + +cglobal idct_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 + jmp m(idct_4x8_internal_10bpc).pass1 +.pass2: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + ; transpose & interleave + pshufd m0, m0, q1320 + pshufd m1, m1, q1320 + pshufd m2, m2, q1320 + pshufd m3, m3, q1320 + punpckldq m4, m0, m1 + punpckhdq m0, m1 + punpckldq m5, m2, m3 + punpckhdq m2, m3 + vpermq m0, m0, q3102 + vpermq m2, m2, q3102 + vperm2i128 m1, m0, m2, 0x31 ; 1 5 (interleaved) + vperm2i128 m3, m0, m2, 0x20 ; 7 3 (interleaved) + vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved) + vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved) + vpbroadcastd m7, [pd_2048] + call m(idct_8x4_internal_10bpc).main + psubd m3, m0, m4 ; out7 out6 + paddd m0, m4 ; out0 out1 + paddd m1, m2, m5 ; out3 out2 + psubd m2, m5 ; out4 out5 + pshufd m1, m1, q1032 + pshufd m3, m3, q1032 + jmp m(iadst_4x8_internal_12bpc).end + +INV_TXFM_4X8_FN adst, dct, 12 +INV_TXFM_4X8_FN adst, adst, 12 +INV_TXFM_4X8_FN adst, flipadst, 12 +INV_TXFM_4X8_FN adst, identity, 12 + +cglobal iadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 + call m(iadst_8x4_internal_10bpc).main + psrad m0, m4, 1 + psrad m1, m6, 1 + psrad m2, 1 + psrad 
m3, 1 +.pass1_end: + vpbroadcastd m5, [pd_1024] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 11}, m0, m1, m2, m3 + jmp tx2q +.pass2: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + call .pass2_main + vpblendd m3, m0, m4, 0x33 ; out6 out7 + vpblendd m0, m4, 0xcc ; out0 out1 + pshufd m1, m5, q1032 + psignd m2, m6 ; out4 out5 + psignd m1, m6 ; out2 out3 +.end: + vpbroadcastd m4, [pw_16384] + REPX {psrad x, 3}, m0, m1, m2, m3 + packssdw m0, m2 ; 0 1 4 5 (interleaved) + packssdw m1, m3 ; 2 3 6 7 (interleaved) + mova m2, [iadst8_12_shuf] + vpermd m0, m2, m0 ; 0 1 4 5 + vpermd m1, m2, m1 ; 2 3 6 7 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+strideq*1] + movq xm5, [dstq+strideq*2] + movhps xm5, [dstq+r3 ] + movq xm6, [r6 +strideq*0] + movhps xm6, [r6 +strideq*1] + vinserti128 m4, xm6, 1 + movq xm7, [r6 +strideq*2] + movhps xm7, [r6 +r3 ] + vinserti128 m5, xm7, 1 + paddw m0, m4 ; 0 1 4 5 + paddw m1, m5 ; 2 3 6 7 + vpbroadcastd m5, [pixel_12bpc_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + REPX {pmaxsw x, m4}, m0, m1 + REPX {pminsw x, m5}, m0, m1 + vextracti128 xm2, m0, 1 ; out4 out5 + vextracti128 xm3, m1, 1 ; out6 out7 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r3 ], xm1 + movq [r6 +strideq*0], xm2 + movhps [r6 +strideq*1], xm2 + movq [r6 +strideq*2], xm3 + movhps [r6 +r3 ], xm3 + RET +ALIGN function_align +.pass2_main: + ; transpose & interleave + pshufd m0, m0, q1320 + pshufd m1, m1, q1320 + pshufd m2, m2, q1320 + pshufd m3, m3, q1320 + punpckldq m4, m0, m1 + punpckhdq m0, m1 + punpckldq m5, m2, m3 + punpckhdq m2, m3 + vperm2i128 m1, m0, m2, 0x31 ; 7 5 (interleaved) + vperm2i128 m3, m0, m2, 0x20 ; 3 1 (interleaved) + vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved) + vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved) + vpbroadcastd m7, [pd_2048] + jmp m(iadst_4x8_internal_10bpc).main3 + +INV_TXFM_4X8_FN flipadst, dct, 12 +INV_TXFM_4X8_FN flipadst, adst, 12 +INV_TXFM_4X8_FN flipadst, flipadst, 12 +INV_TXFM_4X8_FN flipadst, identity, 12 + +cglobal iflipadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 + call m(iadst_8x4_internal_10bpc).main + psrad m0, m3, 1 + psrad m1, m2, 1 + psrad m2, m6, 1 + psrad m3, m4, 1 + jmp m(iadst_4x8_internal_12bpc).pass1_end +.pass2: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + call m(iadst_4x8_internal_12bpc).pass2_main + shufpd m3, m4, m0, 0x05 ; out1 out0 + shufpd m0, m4, 0x05 ; out7 out6 + psignd m2, m6 + pshufd m6, m6, q1032 + pshufd m1, m2, q1032 ; out5 out4 + psignd m2, m5, m6 ; out3 out2 + jmp m(iadst_4x8_internal_12bpc).end + +INV_TXFM_4X8_FN identity, dct, 12 +INV_TXFM_4X8_FN identity, adst, 12 +INV_TXFM_4X8_FN identity, flipadst, 12 +INV_TXFM_4X8_FN identity, identity, 12 + +cglobal iidentity_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 + jmp m(iidentity_4x8_internal_10bpc).pass1 +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 + ; m2 = in4 in5 + ; m3 = in6 in7 + vpbroadcastd m6, [pixel_12bpc_max] + call m(iidentity_4x8_internal_10bpc).pass2_end + RET + +%macro INV_TXFM_4X16_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 4x16, %3 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 181 + vpbroadcastd xm2, [dconly_%3bpc] + mov [cq], eobd ; 0 + or r3d, 16 + add r6d, 384 + sar r6d, 
9 + jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly3 +%endif +%endmacro + +INV_TXFM_4X16_FN dct, dct +INV_TXFM_4X16_FN dct, identity +INV_TXFM_4X16_FN dct, adst +INV_TXFM_4X16_FN dct, flipadst + +cglobal idct_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 +.pass1: + vpbroadcastd m10, [pd_3072] + mova m1, [cq+32*2] + mova m3, [cq+32*6] + mova m5, [cq+32*3] + mova m7, [cq+32*7] + call .pass1_main + pmulld m0, m6, [cq+32*0] + pmulld m2, m6, [cq+32*4] + pmulld m4, m6, [cq+32*1] + pmulld m6, [cq+32*5] + call .pass1_main2 + REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 + jmp tx2q +.pass2: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + lea r6, [deint_shuf+128] + punpcklwd m4, m2, m3 + punpckhwd m2, m3 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhdq m1, m0, m4 ; 2 3 + punpckldq m0, m4 ; 0 1 + punpckldq m4, m5, m2 ; 8 9 + punpckhdq m5, m2 ; a b + vextracti128 xm2, m0, 1 ; 4 5 + vextracti128 xm3, m1, 1 ; 6 7 + vextracti128 xm6, m4, 1 ; c d + vextracti128 xm7, m5, 1 ; e f + call m(idct_4x16_internal_8bpc).main + vpbroadcastd m9, [pw_2048] + vinserti128 m0, m0, xm1, 1 ; 0 1 3 2 + vinserti128 m1, m2, xm3, 1 ; 4 5 7 6 + vinserti128 m2, m4, xm5, 1 ; 8 9 b a + vinserti128 m3, m6, xm7, 1 ; c d f e + vpbroadcastd m8, [pixel_10bpc_max] + call .pass2_end + RET +ALIGN function_align +.pass1_main: + vpbroadcastd m4, [pd_3784] + vpbroadcastd m8, [pd_1567] + vpbroadcastd m9, [pd_2048] + vpbroadcastd m6, [pd_1448] + ITX_MULSUB_2D 1, 3, 0, 2, _, 9, 8, 4 ; t2l, t3l + ITX_MULSUB_2D 5, 7, 4, 2, _, 9, 8, 4 ; t2h, t3h + ret +ALIGN function_align +.pass1_main2: + paddd m0, m10 + paddd m4, m10 + paddd m8, m0, m2 + psubd m0, m2 + paddd m9, m4, m6 + psubd m4, m6 + REPX {psrad x, 11}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h + psubd m2, m0, m1 + paddd m1, m0 + psubd m6, m4, m5 + paddd m5, m4 + paddd m0, m8, m3 + psubd m3, m8, m3 + paddd m4, m9, m7 + psubd m7, m9, m7 + ret +ALIGN function_align +.pass2_end: + lea r6, [strideq*3] + pxor m7, m7 + pmulhrsw m0, m9 + call .write_4x4 + pmulhrsw m0, m1, m9 + call .write_4x4 + pmulhrsw m0, m2, m9 + call .write_4x4 + pmulhrsw m0, m3, m9 + call .write_4x4 + ret +ALIGN function_align +.write_4x4: + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+strideq*1] + vpbroadcastq m5, [dstq+strideq*2] + vpbroadcastq m6, [dstq+r6 ] + mova [cq+32*0], m7 + mova [cq+32*1], m7 + add cq, 32*2 + vpblendd m4, m5, 0xc0 + vpblendd m4, m6, 0x30 + paddw m4, m0 + pmaxsw m4, m7 + pminsw m4, m8 + vextracti128 xm5, m4, 1 + movq [dstq+strideq*0], xm4 + movhps [dstq+strideq*1], xm4 + movhps [dstq+strideq*2], xm5 + movq [dstq+r6 ], xm5 + lea dstq, [dstq+strideq*4] + ret + +INV_TXFM_4X16_FN adst, dct +INV_TXFM_4X16_FN adst, adst +INV_TXFM_4X16_FN adst, flipadst +INV_TXFM_4X16_FN adst, identity + +cglobal iadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 + call m(iadst_16x4_internal_10bpc).main + vpbroadcastd m6, [pd_6144] + call m(iadst_16x4_internal_10bpc).main_end + psrad m0, m4, 13 + psrad m1, m5, 13 + psrad m2, 13 + psrad m3, 13 + psrad m4, m8, 13 + psrad m5, m9, 13 + psrad m6, 13 + psrad m7, 13 + jmp tx2q +.pass2: + call .pass2_main + vpbroadcastd m5, [pw_2048] + vpbroadcastd m8, [pixel_10bpc_max] + lea r6, [strideq*3] + vpblendd m4, m3, m0, 0xcc ; -out3 out0 out2 -out1 + pshufd m2, m2, q1032 ; -out11 out8 out10 -out9 + vpblendd m3, m0, 0x33 ; -out15 out12 out14 -out13 + pxor m7, m7 + psubw m9, m7, m5 + vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048 + pmulhrsw m0, m4, m9 + call .write_4x4 + pmulhrsw m0, m1, m9 + call .write_4x4 + pmulhrsw m0, m2, m9 + call 
.write_4x4 + pmulhrsw m0, m3, m9 + call .write_4x4 + RET +ALIGN function_align +.write_4x4: + movq xm4, [dstq+r6 ] + movhps xm4, [dstq+strideq*0] + vpbroadcastq m5, [dstq+strideq*1] + vpbroadcastq m6, [dstq+strideq*2] + mova [cq+32*0], m7 + mova [cq+32*1], m7 + add cq, 32*2 + vpblendd m4, m5, 0xc0 + vpblendd m4, m6, 0x30 + paddw m4, m0 + pmaxsw m4, m7 + pminsw m4, m8 + vextracti128 xm5, m4, 1 + movhps [dstq+strideq*0], xm4 + movhps [dstq+strideq*1], xm5 + movq [dstq+strideq*2], xm5 + movq [dstq+r6 ], xm4 + lea dstq, [dstq+strideq*4] + ret +ALIGN function_align +.pass2_main: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + lea r6, [deint_shuf+128] + punpcklwd m4, m2, m3 + punpckhwd m2, m3 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhdq m1, m0, m4 + punpckldq m0, m4 + punpckldq m4, m5, m2 + punpckhdq m5, m2 + vpblendd m3, m0, m1, 0x33 + vpblendd m0, m1, 0xcc + shufpd m2, m5, m4, 0x05 + shufpd m4, m5, 0x05 + vperm2i128 m1, m0, m3, 0x31 ; 4 7 6 5 + vinserti128 m0, xm3, 1 ; 0 3 2 1 + vperm2i128 m3, m2, m4, 0x31 ; c f e d ; ???? + vinserti128 m2, xm4, 1 ; b 8 9 a + call m(iadst_4x16_internal_8bpc).main2 + vpbroadcastd m5, [pw_2896x8] + paddsw m1, m2, m4 + psubsw m2, m4 + pmulhrsw m1, m5 ; -out7 out4 out6 -out5 + pmulhrsw m2, m5 ; out8 -out11 -out9 out10 + ret +ALIGN function_align +.main: + vbroadcasti128 m0, [cq+16* 0] + vbroadcasti128 m4, [cq+16* 2] + vbroadcasti128 m1, [cq+16*15] + vbroadcasti128 m5, [cq+16*13] + vbroadcasti128 m2, [cq+16* 4] + vbroadcasti128 m6, [cq+16* 6] + vbroadcasti128 m3, [cq+16*11] + vbroadcasti128 m7, [cq+16* 9] + shufpd m0, m4, 0x0c ; 0 2 + shufpd m1, m5, 0x0c ; 15 13 + shufpd m2, m6, 0x0c ; 4 6 + shufpd m3, m7, 0x0c ; 11 9 + vbroadcasti128 m4, [cq+16* 8] + vbroadcasti128 m6, [cq+16*10] + vbroadcasti128 m5, [cq+16* 7] + vbroadcasti128 m7, [cq+16* 5] + shufpd m4, m6, 0x0c ; 8 10 + shufpd m5, m7, 0x0c ; 7 5 + vbroadcasti128 m6, [cq+16*12] + vbroadcasti128 m7, [cq+16*14] + shufpd m6, m7, 0x0c ; 12 14 + vbroadcasti128 m7, [cq+16* 3] + vbroadcasti128 m8, [cq+16* 1] + shufpd m7, m8, 0x0c ; 3 1 +.main2: + ; expects: m12 = clip_min m13 = clip_max + vpbroadcastd m11, [pd_2048] + ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201_995, 4091_3973, 1 + ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751_2440, 3703_3290, 1 + ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3035_3513, 2751_2106, 1 + ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 3857_4052, 1380_601, 1 + psubd m8, m0, m4 ; t8a t10a + paddd m0, m4 ; t0a t2a + psubd m4, m1, m5 ; t9a t11a + paddd m1, m5 ; t1a t3a + psubd m5, m2, m6 ; t12a t14a + paddd m2, m6 ; t4a t6a + psubd m6, m3, m7 ; t13a t15a + paddd m3, m7 ; t5a t7a + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m8 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m8 + ITX_MULSUB_2D 8, 4, 7, 9, 10, 11, 799_3406, 4017_2276, 1 + ITX_MULSUB_2D 6, 5, 7, 9, 10, 11, 4017_2276, 10, 1 + psubd m7, m0, m2 ; t4 t6 + paddd m0, m2 ; t0 t2 + psubd m2, m1, m3 ; t5 t7 + paddd m1, m3 ; t1 t3 + psubd m3, m4, m6 ; t12a t14a + paddd m4, m6 ; t8a t10a + psubd m6, m8, m5 ; t13a t15a + paddd m8, m5 ; t9a t11a + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m6, m7, m8 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m6, m7, m8 + punpcklqdq m5, m3, m7 ; t12a t4 + punpckhqdq m3, m7 ; t14a t6 + punpckhqdq m7, m6, m2 ; t15a t7 + punpcklqdq m6, m2 ; t13a t5 + ITX_MULSUB_2D 7, 3, 2, 9, 10, 11, 3784, 1567 + ITX_MULSUB_2D 5, 6, 2, 9, 10, 11, 1567, 10 + vpbroadcastd m10, [pd_2896] + vbroadcasti128 m9, [pw_2048_m2048] ; + + - - + punpckhqdq m2, m4, m0 ; t10a t2 + punpcklqdq m4, m0 ; t8a t0 + punpckhqdq m0, m8, m1 ; t11a 
t3 + punpcklqdq m8, m1 ; t9a t1 + paddd m1, m6, m7 ; out2 -out3 + psubd m6, m7 ; t14a t6 + paddd m7, m5, m3 ; -out13 out12 + psubd m5, m3 ; t15a t7 + psubd m3, m8, m0 ; t11 t3a + paddd m8, m0 ; out14 -out15 + paddd m0, m4, m2 ; -out1 out0 + psubd m4, m2 ; t10 t2a + REPX {pmaxsd x, m12}, m6, m5, m3, m4 + REPX {pminsd x, m13}, m6, m5, m3, m4 + REPX {pmulld x, m10}, m6, m5, m3, m4 + paddd m6, m11 + paddd m4, m11 + paddd m2, m6, m5 ; -out5 out4 + psubd m6, m5 ; out10 -out11 + psubd m5, m4, m3 ; -out9 out8 + paddd m3, m4 ; out6 -out7 + REPX {psrad x, 12}, m2, m3, m5, m6 + REPX {psignd x, m9}, m1, m8, m3, m6 + pshufd m9, m9, q1032 + REPX {psignd x, m9}, m0, m7, m2, m5 + ret + +INV_TXFM_4X16_FN flipadst, dct +INV_TXFM_4X16_FN flipadst, adst +INV_TXFM_4X16_FN flipadst, flipadst +INV_TXFM_4X16_FN flipadst, identity + +cglobal iflipadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 +.pass1: + call m(iadst_16x4_internal_10bpc).main + vpbroadcastd m6, [pd_6144] + call m(iadst_16x4_internal_10bpc).main_end + psrad m0, m3, 13 + psrad m1, m2, 13 + psrad m2, m5, 13 + psrad m3, m4, 13 + psrad m4, m7, 13 + psrad m5, m6, 13 + psrad m6, m9, 13 + psrad m7, m8, 13 + jmp tx2q +.pass2: + call m(iadst_4x16_internal_10bpc).pass2_main + vpbroadcastd m5, [pw_2048] + vpbroadcastd m8, [pixel_10bpc_max] + lea r6, [strideq*3] + vpblendd m4, m3, m0, 0x33 ; -out0 out3 out1 -out2 + pshufd m2, m2, q1032 ; -out11 out8 out10 -out9 + vpblendd m3, m0, 0xcc ; -out12 out15 out13 -out14 + pxor m7, m7 + psubw m9, m7, m5 + vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048 + pmulhrsw m0, m4, m9 + call .write_4x4 + pmulhrsw m0, m2, m9 + call .write_4x4 + pmulhrsw m0, m1, m9 + call .write_4x4 + pmulhrsw m0, m3, m9 + call .write_4x4 + RET +ALIGN function_align +.write_4x4: + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+r6 ] + vpbroadcastq m5, [dstq+strideq*1] + vpbroadcastq m6, [dstq+strideq*2] + mova [cq+32*0], m7 + mova [cq+32*1], m7 + add cq, 32*2 + vpblendd m4, m5, 0x30 + vpblendd m4, m6, 0xc0 + paddw m4, m0 + pmaxsw m4, m7 + pminsw m4, m8 + vextracti128 xm5, m4, 1 + movq [dstq+strideq*0], xm4 + movq [dstq+strideq*1], xm5 + movhps [dstq+strideq*2], xm5 + movhps [dstq+r6 ], xm4 + lea dstq, [dstq+strideq*4] + ret + +INV_TXFM_4X16_FN identity, dct +INV_TXFM_4X16_FN identity, adst +INV_TXFM_4X16_FN identity, flipadst +INV_TXFM_4X16_FN identity, identity + +cglobal iidentity_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 + vpbroadcastd m7, [pd_5793] + pmulld m0, m7, [cq+32*0] + pmulld m4, m7, [cq+32*1] + pmulld m1, m7, [cq+32*2] + pmulld m5, m7, [cq+32*3] + pmulld m2, m7, [cq+32*4] + pmulld m6, m7, [cq+32*5] + pmulld m3, m7, [cq+32*6] + pmulld m7, [cq+32*7] + vpbroadcastd m8, [pd_6144] + REPX {paddd x, m8}, m0, m4, m1, m5, m2, m6, m3, m7 + REPX {psrad x, 13}, m0, m4, m1, m5, m2, m6, m3, m7 + jmp tx2q +.pass2: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + vpbroadcastd m7, [pw_1697x16] + vpbroadcastd m8, [pw_2048] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + REPX {paddsw x, x}, m0, m1, m2, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + vpbroadcastd m4, [pixel_10bpc_max] + call .pass2_end + RET +ALIGN function_align +.pass2_end: + punpckhwd m7, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + lea r6, [strideq*5] + pxor m3, m3 + punpckhdq m5, m0, m2 ; 2 3 6 7 + punpckldq m0, m2 ; 0 1 4 5 + punpckldq m6, m7, m1 ; 8 9 c d + punpckhdq m7, m1 ; a b e f + pmulhrsw m0, m8 + call .write_2x4x2 + pmulhrsw m0, m5, m8 + call 
.write_2x4x2 + pmulhrsw m0, m6, m8 + lea dstq, [dstq+strideq*4] + call .write_2x4x2 + pmulhrsw m0, m7, m8 + call .write_2x4x2 + ret +ALIGN function_align +.write_2x4x2: + movq xm1, [dstq+strideq*0] + movhps xm1, [dstq+strideq*1] + vpbroadcastq m2, [dstq+strideq*4] + vpblendd m1, m2, 0x30 + vpbroadcastq m2, [dstq+r6 ] + vpblendd m1, m2, 0xc0 + mova [cq+32*0], m3 + mova [cq+32*1], m3 + add cq, 32*2 + paddw m1, m0 + pmaxsw m1, m3 + pminsw m1, m4 + vextracti128 xm2, m1, 1 + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + movq [dstq+strideq*4], xm2 + movhps [dstq+r6 ], xm2 + lea dstq, [dstq+strideq*2] + ret + +INV_TXFM_4X16_FN dct, dct, 12 +INV_TXFM_4X16_FN dct, identity, 12 +INV_TXFM_4X16_FN dct, adst, 12 +INV_TXFM_4X16_FN dct, flipadst, 12 + +cglobal idct_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 + jmp m(idct_4x16_internal_10bpc).pass1 +.pass2: + punpckldq m8, m0, m1 + punpckhdq m0, m1 + punpckldq m9, m2, m3 + punpckhdq m2, m3 + punpckldq m1, m4, m5 + punpckhdq m4, m5 + punpckldq m3, m6, m7 + punpckhdq m6, m7 + punpcklqdq m5, m0, m2 ; 2 6 + punpckhqdq m12, m0, m2 ; 3 7 + punpcklqdq m0, m8, m9 ; 0 4 + punpckhqdq m10, m8, m9 ; 1 5 + punpcklqdq m2, m1, m3 ; 8 12 + punpckhqdq m13, m1, m3 ; 9 13 + punpcklqdq m9, m4, m6 ; 10 14 + punpckhqdq m4, m6 ; 11 15 + vperm2i128 m1, m5, m9, 0x20 ; 2 10 + vperm2i128 m3, m9, m5, 0x31 ; 14 6 + vpermq m11, m4, q1302 ; 15 11 + ; interleave + REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m10 + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3, m10, m11, m12, m13 + REPX {pminsd x, m9}, m0, m1, m2, m3, m10, m11, m12, m13 + call m(idct_16x4_internal_10bpc).pass1_main + vpermq m6, m12, q1302 ; 7 3 + vpermq m5, m13, q3120 ; 9 13 + call m(idct_16x4_internal_10bpc).pass1_main2 + call m(idct_16x4_internal_10bpc).pass1_main3 + REPX {psrad x, 3}, m0, m1, m2, m3, m4, m5, m6, m7 + packssdw m0, m1 + packssdw m1, m2, m3 + packssdw m2, m4, m5 + packssdw m3, m6, m7 + mova m4, [idct16_12_shuf] + REPX {vpermd x, m4, x}, m0, m1, m2, m3 + vpbroadcastd m9, [pw_16384] + vpbroadcastd m8, [pixel_12bpc_max] + call m(idct_4x16_internal_10bpc).pass2_end + RET + +INV_TXFM_4X16_FN adst, dct, 12 +INV_TXFM_4X16_FN adst, adst, 12 +INV_TXFM_4X16_FN adst, flipadst, 12 +INV_TXFM_4X16_FN adst, identity, 12 + +cglobal iadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 + call .main_pass1 + psrad m0, m4, 12 + psrad m1, m5, 12 + psrad m2, 12 + psrad m3, 12 + psrad m4, m8, 12 + psrad m5, m9, 12 + psrad m6, 12 + psrad m7, 12 + jmp tx2q +.pass2: + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call .transpose_16x4 + call m(iadst_4x16_internal_10bpc).main2 + pshufd m4, m5, q1032 + psrad m5, m6, 3 + pshufd m6, m7, q1032 + psrad m7, m8, 3 + REPX {pshufd x, x, q1032}, m0, m2 + REPX {psrad x, 3}, m0, m1, m2, m3, m4, m6 +.pass2_end: + packssdw m0, m1 + packssdw m1, m2, m3 + packssdw m2, m4, m5 + packssdw m3, m6, m7 + mova m4, [iadst16_12_shuf] + REPX {vpermd x, m4, x}, m0, m1, m2, m3 + vpbroadcastd m9, [pw_16384] + vpbroadcastd m8, [pixel_12bpc_max] + lea r6, [strideq*3] + pxor m7, m7 + pmulhrsw m0, m9 + call m(iadst_4x16_internal_10bpc).write_4x4 + pmulhrsw m0, m9, m1 + call m(iadst_4x16_internal_10bpc).write_4x4 + pmulhrsw m0, m9, m2 + call m(iadst_4x16_internal_10bpc).write_4x4 + pmulhrsw m0, m9, m3 + call m(iadst_4x16_internal_10bpc).write_4x4 + RET +ALIGN function_align +.transpose_16x4: + ; 
+    punpckldq    m8, m0, m1
+    punpckhdq    m0, m1
+    punpckldq    m9, m2, m3
+    punpckhdq    m2, m3
+    punpckldq    m1, m4, m5
+    punpckhdq    m4, m5
+    punpckldq    m3, m6, m7
+    punpckhdq    m6, m7
+    punpcklqdq   m10, m8, m0
+    punpckhqdq   m0, m8
+    punpcklqdq   m11, m9, m2
+    punpckhqdq   m2, m9
+    punpcklqdq   m8, m1, m4
+    punpckhqdq   m4, m1
+    punpcklqdq   m9, m3, m6
+    punpckhqdq   m6, m3
+    vperm2i128   m5, m0, m2, 0x31   ; 7 5
+    vperm2i128   m7, m0, m2, 0x20   ; 3 1
+    vperm2i128   m0, m10, m11, 0x20 ; 0 2
+    vperm2i128   m2, m10, m11, 0x31 ; 4 6
+    vperm2i128   m1, m4, m6, 0x31   ; 15 13
+    vperm2i128   m3, m4, m6, 0x20   ; 11 9
+    vperm2i128   m4, m8, m9, 0x20   ; 8 10
+    vperm2i128   m6, m8, m9, 0x31   ; 12 14
+    ret
+ALIGN function_align
+.main_pass1:
+    call m(iadst_16x4_internal_10bpc).main
+    vpbroadcastd m6, [pd_3072]
+    paddd        m10, m4, m5
+    psubd        m4, m3
+    psubd        m5, m3
+    paddd        m3, m10
+    psubd        m8, m7, m1
+    paddd        m7, m9
+    psubd        m9, m1
+    paddd        m7, m1
+    REPX {psrad x, 1 }, m4, m5, m2, m3, m8, m9, m0, m7
+    REPX {paddd x, m6}, m4, m5, m2, m3, m8, m9, m7
+    paddd        m6, m0
+    ret
+
+INV_TXFM_4X16_FN flipadst, dct, 12
+INV_TXFM_4X16_FN flipadst, adst, 12
+INV_TXFM_4X16_FN flipadst, flipadst, 12
+INV_TXFM_4X16_FN flipadst, identity, 12
+
+cglobal iflipadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    call m(iadst_4x16_internal_12bpc).main_pass1
+    psrad        m0, m3, 12
+    psrad        m1, m2, 12
+    psrad        m2, m5, 12
+    psrad        m3, m4, 12
+    psrad        m4, m7, 12
+    psrad        m5, m6, 12
+    psrad        m6, m9, 12
+    psrad        m7, m8, 12
+    jmp tx2q
+.pass2:
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+    REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+    call m(iadst_4x16_internal_12bpc).transpose_16x4
+    call m(iadst_4x16_internal_10bpc).main2
+    pshufd       m4, m3, q1032
+    psrad        m3, m5, 3
+    psrad        m5, m2, 3
+    pshufd       m2, m6, q1032
+    pshufd       m6, m1, q1032
+    psrad        m1, m7, 3
+    psrad        m7, m0, 3
+    pshufd       m0, m8, q1032
+    REPX {psrad x, 3}, m0, m2, m4, m6
+    jmp m(iadst_4x16_internal_12bpc).pass2_end
+
+INV_TXFM_4X16_FN identity, dct, 12
+INV_TXFM_4X16_FN identity, adst, 12
+INV_TXFM_4X16_FN identity, flipadst, 12
+INV_TXFM_4X16_FN identity, identity, 12
+
+cglobal iidentity_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m8, [pd_1697]
+    mova         m0, [cq+32*0]
+    mova         m4, [cq+32*1]
+    mova         m1, [cq+32*2]
+    mova         m5, [cq+32*3]
+    vpbroadcastd m9, [pd_6144]
+    pmulld       m2, m8, m0
+    pmulld       m6, m8, m4
+    pmulld       m3, m8, m1
+    pmulld       m7, m8, m5
+    mova         m10, [cq+32*4]
+    mova         m11, [cq+32*5]
+    mova         m12, [cq+32*6]
+    mova         m13, [cq+32*7]
+    REPX {paddd x, m9}, m2, m6, m3, m7
+    REPX {psrad x, 12}, m2, m6, m3, m7
+    paddd        m0, m2
+    pmulld       m2, m8, m10
+    paddd        m4, m6
+    pmulld       m6, m8, m11
+    paddd        m1, m3
+    pmulld       m3, m8, m12
+    paddd        m5, m7
+    pmulld       m7, m8, m13
+    REPX {psrad x, 1 }, m0, m4, m1, m5
+    REPX {paddd x, m9}, m2, m6, m3, m7
+    REPX {psrad x, 12}, m2, m6, m3, m7
+    paddd        m2, m10
+    paddd        m6, m11
+    paddd        m3, m12
+    paddd        m7, m13
+    REPX {psrad x, 1 }, m2, m6, m3, m7
+    jmp tx2q
+.pass2:
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+    REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+    vpbroadcastd m8, [pd_5793]
+    vpbroadcastd m9, [pd_1024]
+    REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
+    packssdw     m0, m4
+    packssdw     m1, m5
+    packssdw     m2, m6
+    packssdw     m3, m7
+    vpbroadcastd m8, [pw_16384]
+    vpbroadcastd m4, [pixel_12bpc_max]
+    call m(iidentity_4x16_internal_10bpc).pass2_end
+    RET
+
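+; DC-only shortcut used by the dconly paths below: 181/256 ~= 1/sqrt(2)
+; (181 = round(128*sqrt(2)), matching pd_2896 = round(2048*sqrt(2))), so
+; each "imul 181 / add 128 / sar 8" round applies one sqrt(2) scaling step
+; of the inverse transform to the lone DC coefficient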
+%macro INV_TXFM_8X4_FN 2-3 10 ; type1, type2, bitdepth
+    INV_TXFM_FN %1, %2, 0, 8x4, %3
+%ifidn %1_%2, dct_dct
+    vpbroadcastd m2, [dconly_%3bpc]
+%if %3 = 10
+.dconly:
+    imul         r6d, [cq], 181
+    mov          [cq], eobd ; 0
+    or           r3d, 4
+    add          r6d, 128
+    sar          r6d, 8
+    imul         r6d, 181
+    add          r6d, 128
+    sar          r6d, 8
+    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
+%else
+    jmp m(inv_txfm_add_dct_dct_8x4_10bpc).dconly
+%endif
+%endif
+%endmacro
+
+INV_TXFM_8X4_FN dct, dct
+INV_TXFM_8X4_FN dct, identity
+INV_TXFM_8X4_FN dct, adst
+INV_TXFM_8X4_FN dct, flipadst
+
+cglobal idct_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2
+    vpbroadcastd m8, [clip_18b_min]
+    vpbroadcastd m9, [clip_18b_max]
+.pass1:
+    vbroadcasti128 m1, [cq+16*1]
+    vbroadcasti128 m0, [cq+16*5]
+    vbroadcasti128 m2, [cq+16*3]
+    vbroadcasti128 m3, [cq+16*7]
+    vpbroadcastd m6, [pd_2896]
+    shufpd       m1, m0, 0x0c ; 1 5
+    shufpd       m3, m2, 0x0c ; 7 3
+    vbroadcasti128 m0, [cq+16*0]
+    vbroadcasti128 m4, [cq+16*2]
+    vbroadcasti128 m2, [cq+16*4]
+    vbroadcasti128 m5, [cq+16*6]
+    vpbroadcastd m7, [pd_2048]
+    shufpd       m0, m4, 0x0c ; 0 2
+    shufpd       m2, m5, 0x0c ; 4 6
+    REPX {pmulld x, m6}, m1, m3, m0, m2
+    REPX {paddd x, m7}, m1, m3, m0, m2
+    REPX {psrad x, 12}, m1, m3, m0, m2
+    call .main
+    psubd        m3, m0, m4 ; out7 out6 (interleaved)
+    paddd        m0, m4     ; out0 out1 (interleaved)
+    paddd        m1, m2, m5 ; out3 out2 (interleaved)
+    psubd        m2, m5     ; out4 out5 (interleaved)
+    pshufd       m1, m1, q1032
+    pshufd       m3, m3, q1032
+    jmp tx2q
+.pass2:
+    vbroadcasti128 m4, [deint_shuf]
+    packssdw     m0, m1
+    packssdw     m2, m3
+    vperm2i128   m1, m0, m2, 0x31
+    vinserti128  m0, xm2, 1
+    pshufb       m0, m4
+    pshufb       m1, m4
+    IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 7
+    vpermq       m0, m0, q3120 ; out0 out1
+    vpermq       m2, m1, q2031 ; out2 out3
+    jmp m(iadst_8x4_internal_10bpc).end
+ALIGN function_align
+.main:
+    ITX_MULSUB_2D 1, 3, 4, 5, 6, 7, 799_3406, 4017_2276, 1
+    IDCT4_1D_PACKED 0, 2, 4, 5, 6, 7
+    vpbroadcastd m6, [pd_2896]
+    punpcklqdq   m4, m1, m3 ; t4a t7a
+    punpckhqdq   m1, m3     ; t5a t6a
+    psubd        m3, m4, m1 ; t5a t6a
+    paddd        m4, m1     ; t4 t7
+    REPX {pmaxsd x, m8}, m3, m4, m0, m2
+    REPX {pminsd x, m9}, m3, m4, m0, m2
+    pmulld       m3, m6
+    pshufd       m1, m3, q1032
+    paddd        m3, m7
+    psubd        m5, m3, m1
+    paddd        m1, m3
+    psrad        m5, 12
+    psrad        m1, 12
+    vpblendd     m5, m4, 0x33 ; t4 t5
+    punpckhqdq   m4, m1       ; t7 t6
+    ret
+
+INV_TXFM_8X4_FN adst, dct
+INV_TXFM_8X4_FN adst, adst
+INV_TXFM_8X4_FN adst, flipadst
+INV_TXFM_8X4_FN adst, identity
+
+cglobal iadst_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2
+    call m(iadst_4x8_internal_10bpc).main
+    vpblendd     m3, m0, m4, 0x33 ; out6 out7
+    vpblendd     m0, m4, 0xcc     ; out0 out1
+    pshufd       m1, m5, q1032
+    psignd       m2, m6 ; out4 out5
+    psignd       m1, m6 ; out2 out3
+    jmp tx2q
+.pass2:
+    call .pass2_main
+    vpermq       m0, m0, q3120 ; out0 out1
+    vpermq       m2, m1, q3120 ; out2 out3
+.end:
+    vpbroadcastd m1, [pw_2048]
+    pmulhrsw     m0, m1
+    pmulhrsw     m1, m2
+    vpbroadcastd m5, [pixel_10bpc_max]
+.end2:
+    mova         xm2, [dstq+strideq*0]
+    vinserti128  m2, [dstq+strideq*1], 1
+    lea          r6, [dstq+strideq*2]
+    mova         xm3, [r6 +strideq*0]
+    vinserti128  m3, [r6 +strideq*1], 1
+    pxor         m4, m4
+    REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
+    paddw        m0, m2
+    paddw        m1, m3
+    pmaxsw       m0, m4
+    pmaxsw       m1, m4
+    pminsw       m0, m5
+    pminsw       m1, m5
+    mova         [dstq+strideq*0], xm0
+    vextracti128 [dstq+strideq*1], m0, 1
+    mova         [r6 +strideq*0], xm1
+    vextracti128 [r6 +strideq*1], m1, 1
+    RET
+ALIGN function_align
+.pass2_main:
+    vbroadcasti128 m4, [deint_shuf]
+    packssdw     m0, m1
+    packssdw     m2, m3
+    lea          r6, [deint_shuf+128]
+    vperm2i128   m1, m0, m2, 0x31
+    vinserti128  m0, xm2, 1
+    pshufb       m0, m4
+    pshufb       m1, m4
+    jmp m(iadst_8x4_internal_8bpc).main
+ALIGN function_align
+.main:
+    vpbroadcastd m1, [pd_2896]
+    pmulld       m0, m1, [cq+32*0]
+    pmulld       m3, m1, [cq+32*3]
+    pmulld       m2, m1, [cq+32*2]
+    pmulld       m1, [cq+32*1]
+    vpbroadcastd m4, [pd_2048]
+    REPX {paddd x, m4}, m0, m3, m2, m1
+    REPX {psrad x, 12}, m0, m3, m2, m1
+.main2:
+    IADST4_1D
+    ret
+
+INV_TXFM_8X4_FN flipadst, dct
+INV_TXFM_8X4_FN flipadst, adst
+INV_TXFM_8X4_FN flipadst, flipadst
+INV_TXFM_8X4_FN flipadst, identity
+
+cglobal iflipadst_8x4_internal_10bpc, 0, 5, 10, dst, stride, c, eob, tx2
+    call m(iadst_4x8_internal_10bpc).main
+    shufpd       m3, m4, m0, 0x05
+    shufpd       m0, m4, 0x05
+    psignd       m2, m6
+    pshufd       m6, m6, q1032
+    pshufd       m1, m2, q1032
+    psignd       m2, m5, m6
+    jmp tx2q
+.pass2:
+    call m(iadst_8x4_internal_10bpc).pass2_main
+    vpermq       m2, m0, q2031
+    vpermq       m0, m1, q2031
+    jmp m(iadst_8x4_internal_10bpc).end
+
+INV_TXFM_8X4_FN identity, dct
+INV_TXFM_8X4_FN identity, adst
+INV_TXFM_8X4_FN identity, flipadst
+INV_TXFM_8X4_FN identity, identity
+
+cglobal iidentity_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2
+.pass1:
+    vpbroadcastd m4, [pd_2896]
+    vpermq       m0, [cq+32*0], q3120
+    vpermq       m1, [cq+32*1], q3120
+    vpermq       m2, [cq+32*2], q3120
+    vpermq       m3, [cq+32*3], q3120
+    vpbroadcastd m7, [pd_2048]
+    REPX {pmulld x, m4}, m0, m1, m2, m3
+    REPX {paddd x, m7}, m0, m1, m2, m3
+    REPX {psrad x, 12}, m0, m1, m2, m3
+    REPX {paddd x, x }, m0, m1, m2, m3
+    jmp tx2q
+.pass2:
+    vpbroadcastd m5, [pixel_10bpc_max]
+    vpbroadcastd m4, [pw_1697x8]
+    packssdw     m0, m1
+    packssdw     m2, m3
+    pmulhrsw     m1, m4, m0
+    pmulhrsw     m4, m2
+    paddsw       m0, m1
+    paddsw       m2, m4
+    packssdw     m7, m7 ; pw_2048
+.pass2_end:
+    punpckhwd    m1, m0, m2
+    punpcklwd    m0, m2
+    lea          r6, [dstq+strideq*2]
+    punpckhwd    m2, m0, m1
+    punpcklwd    m0, m1
+    pmulhrsw     m2, m7
+    pmulhrsw     m0, m7
+    punpckhwd    m1, m0, m2
+    punpcklwd    m0, m2
+    mova         xm2, [dstq+strideq*0]
+    vinserti128  m2, [r6 +strideq*0], 1
+    mova         xm3, [dstq+strideq*1]
+    vinserti128  m3, [r6 +strideq*1], 1
+    pxor         m4, m4
+    REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
+    paddw        m0, m2
+    paddw        m1, m3
+    pmaxsw       m0, m4
+    pmaxsw       m1, m4
+    pminsw       m0, m5
+    pminsw       m1, m5
+    mova         [dstq+strideq*0], xm0
+    mova         [dstq+strideq*1], xm1
+    vextracti128 [r6 +strideq*0], m0, 1
+    vextracti128 [r6 +strideq*1], m1, 1
+    RET
+
+INV_TXFM_8X4_FN dct, dct, 12
+INV_TXFM_8X4_FN dct, identity, 12
+INV_TXFM_8X4_FN dct, adst, 12
+INV_TXFM_8X4_FN dct, flipadst, 12
+
+cglobal idct_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
+    vpbroadcastd m8, [clip_20b_min]
+    vpbroadcastd m9, [clip_20b_max]
+    jmp m(idct_8x4_internal_10bpc).pass1
+.pass2:
+    vpbroadcastd m8, [clip_18b_min]
+    vpbroadcastd m9, [clip_18b_max]
+    REPX {pmaxsd x, m8}, m0, m1, m2, m3
+    REPX {pminsd x, m9}, m0, m1, m2, m3
+    call m(iadst_8x4_internal_12bpc).transpose_4x8
+    IDCT4_1D 0, 1, 2, 3, 4, 5, 6, 7
+    jmp m(iadst_8x4_internal_12bpc).end
+
+INV_TXFM_8X4_FN adst, dct, 12
+INV_TXFM_8X4_FN adst, adst, 12
+INV_TXFM_8X4_FN adst, flipadst, 12
+INV_TXFM_8X4_FN adst, identity, 12
+
+cglobal iadst_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
+    vpbroadcastd m8, [clip_20b_min]
+    vpbroadcastd m9, [clip_20b_max]
+    call m(iadst_4x8_internal_10bpc).main2
+    vpblendd     m3, m0, m4, 0x33 ; out6 out7
+    vpblendd     m0, m4, 0xcc     ; out0 out1
+    pshufd       m1, m5, q1032
+    psignd       m2, m6 ; out4 out5
+    psignd       m1, m6 ; out2 out3
+    jmp tx2q
+.pass2:
+    vpbroadcastd m8, [clip_18b_min]
+    vpbroadcastd m9, [clip_18b_max]
+    REPX {pmaxsd x, m8}, m0, m1, m2, m3
+    REPX {pminsd x, m9}, m0, m1, m2, m3
+    call .pass2_main
+    vpbroadcastd m5, [pd_2048]
+    paddd        m0, m5, m4
+    paddd        m1, m5, m6
+    paddd        m2, m5
+    paddd        m3, m5
+.pass2_end:
+    REPX {psrad x, 12}, m0, m1, m2, m3
+.end:
+    vpbroadcastd m4, [pw_16384]
+    REPX {psrad x, 3}, m0, m1, m2, m3
+    packssdw     m0, m1
+    packssdw     m2, m3
+    pmulhrsw     m0, m4
+    pmulhrsw     m1, m2, m4
+    vpermq       m0, m0, q3120 ; out0 out1
+    vpermq       m1, m1, q3120 ; out2 out3
+    vpbroadcastd m5, [pixel_12bpc_max]
+    jmp m(iadst_8x4_internal_10bpc).end2
+ALIGN function_align
+.pass2_main:
+    call .transpose_4x8
+    jmp m(iadst_8x4_internal_10bpc).main2
+ALIGN function_align
+.transpose_4x8:
+    ; deinterleave
+    pshufd       m0, m0, q3120
+    pshufd       m1, m1, q3120
+    pshufd       m2, m2, q3120
+    pshufd       m3, m3, q3120
+    ; transpose
+    punpcklqdq   m4, m0, m1
+    punpckhqdq   m0, m1
+    punpcklqdq   m5, m2, m3
+    punpckhqdq   m2, m3
+    vperm2i128   m1, m0, m2, 0x20 ; out1
+    vperm2i128   m3, m0, m2, 0x31 ; out3
+    vperm2i128   m2, m4, m5, 0x31 ; out2
+    vperm2i128   m0, m4, m5, 0x20 ; out0
+    ret
+
+INV_TXFM_8X4_FN flipadst, dct, 12
+INV_TXFM_8X4_FN flipadst, adst, 12
+INV_TXFM_8X4_FN flipadst, flipadst, 12
+INV_TXFM_8X4_FN flipadst, identity, 12
+
+cglobal iflipadst_8x4_internal_12bpc, 0, 5, 10, dst, stride, c, eob, tx2
+    vpbroadcastd m8, [clip_20b_min]
+    vpbroadcastd m9, [clip_20b_max]
+    call m(iadst_4x8_internal_10bpc).main2
+    shufpd       m3, m4, m0, 0x05
+    shufpd       m0, m4, 0x05
+    psignd       m2, m6
+    pshufd       m6, m6, q1032
+    pshufd       m1, m2, q1032
+    psignd       m2, m5, m6
+    jmp tx2q
+.pass2:
+    vpbroadcastd m8, [clip_18b_min]
+    vpbroadcastd m9, [clip_18b_max]
+    REPX {pmaxsd x, m8}, m0, m1, m2, m3
+    REPX {pminsd x, m9}, m0, m1, m2, m3
+    call m(iadst_8x4_internal_12bpc).pass2_main
+    vpbroadcastd m5, [pd_2048]
+    paddd        m0, m5, m3
+    paddd        m1, m5, m2
+    paddd        m3, m5, m4
+    paddd        m2, m5, m6
+    jmp m(iadst_8x4_internal_12bpc).pass2_end
+
+INV_TXFM_8X4_FN identity, dct, 12
+INV_TXFM_8X4_FN identity, adst, 12
+INV_TXFM_8X4_FN identity, flipadst, 12
+INV_TXFM_8X4_FN identity, identity, 12
+
+cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
+    jmp m(iidentity_8x4_internal_10bpc).pass1
+.pass2:
+    ; m0 = in0 in1 (interleaved)
+    ; m1 = in2 in3 (interleaved)
+    ; m2 = in4 in5 (interleaved)
+    ; m3 = in6 in7 (interleaved)
+    vpbroadcastd m8, [clip_18b_min]
+    vpbroadcastd m9, [clip_18b_max]
+    REPX {pmaxsd x, m8}, m0, m1, m2, m3
+    REPX {pminsd x, m9}, m0, m1, m2, m3
+    vpbroadcastd m4, [pd_5793]
+    REPX {pmulld x, m4}, m0, m1, m2, m3
+    REPX {paddd x, m7}, m0, m1, m2, m3
+    REPX {psrad x, 15}, m0, m1, m2, m3
+    vpbroadcastd m5, [pixel_12bpc_max]
+    vpbroadcastd m7, [pw_16384]
+    packssdw     m0, m1
+    packssdw     m2, m3
+    jmp m(iidentity_8x4_internal_10bpc).pass2_end
+
+%macro INV_TXFM_8X8_FN 2-3 10 ; type1, type2, bitdepth
+    INV_TXFM_FN %1, %2, 0, 8x8, %3
+%ifidn %1_%2, dct_dct
+    vpbroadcastd m2, [dconly_%3bpc]
+%if %3 = 10
+.dconly:
+    imul         r6d, [cq], 181
+    mov          [cq], eobd ; 0
+    or           r3d, 8
+.dconly2:
+    add          r6d, 384
+    sar          r6d, 9
+.dconly3:
+    imul         r6d, 181
+    add          r6d, 2176
+    sar          r6d, 12
+    movd         xm0, r6d
+    paddsw       xm0, xm2
+    vpbroadcastw m0, xm0
+.dconly_loop:
+    mova         xm1, [dstq+strideq*0]
+    vinserti128  m1, [dstq+strideq*1], 1
+    paddsw       m1, m0
+    psubusw      m1, m2
+    mova         [dstq+strideq*0], xm1
+    vextracti128 [dstq+strideq*1], m1, 1
+    lea          dstq, [dstq+strideq*2]
+    sub          r3d, 2
+    jg .dconly_loop
+    RET
+%else
+    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly
+%endif
+%endif
+%endmacro
+
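+; 8-point inverse ADST on dword lanes; the constant pairs (401/4076 etc.)
+; are the .12 fixed-point coefficients of the transform, and the clip
+; registers bound every intermediate to the bitdepth-dependent range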
ITX_MULSUB_2D %6, %3, %9, %10, %11, %12, 1931, 3612 ; t3a, t2a + ITX_MULSUB_2D %4, %5, %9, %10, %11, %12, 3166, 2598 ; t5a, t4a + psubd m%9, m%3, m%7 ; t6 + paddd m%3, m%7 ; t2 + psubd m%7, m%1, m%5 ; t4 + paddd m%1, m%5 ; t0 + psubd m%5, m%6, m%2 ; t7 + paddd m%6, m%2 ; t3 + psubd m%2, m%8, m%4 ; t5 + paddd m%8, m%4 ; t1 + REPX {pmaxsd x, m%13}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8 + REPX {pminsd x, m%14}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8 + ITX_MULSUB_2D %7, %2, %4, %10, %11, %12, 1567, 3784 ; t5a, t4a + ITX_MULSUB_2D %5, %9, %4, %10, %11, %12, 3784, %11 ; t6a, t7a + psubd m%10, m%7, m%9 ; t7 + paddd m%7, m%9 ; out6 + vpbroadcastd m%9, [pd_1448] + psubd m%4, m%8, m%6 ; t3 + paddd m%8, m%6 ; -out7 + psubd m%6, m%1, m%3 ; t2 + paddd m%1, m%3 ; out0 + psubd m%3, m%2, m%5 ; t6 + paddd m%2, m%5 ; -out1 + REPX {pmaxsd x, m%13}, m%6, m%4, m%3, m%10 + REPX {pminsd x, m%14}, m%6, m%4, m%3, m%10 + REPX {pmulld x, m%9 }, m%6, m%4, m%3, m%10 + psubd m%5, m%6, m%4 ; (t2 - t3) * 1448 + paddd m%4, m%6 ; (t2 + t3) * 1448 + psubd m%6, m%3, m%10 ; (t6 - t7) * 1448 + paddd m%3, m%10 ; (t6 + t7) * 1448 +%endmacro + +INV_TXFM_8X8_FN dct, dct +INV_TXFM_8X8_FN dct, identity +INV_TXFM_8X8_FN dct, adst +INV_TXFM_8X8_FN dct, flipadst + +cglobal idct_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: + mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + mova m4, [cq+32*4] + mova m5, [cq+32*5] + mova m6, [cq+32*6] + mova m7, [cq+32*7] + vpbroadcastd m11, [pd_2048] + call .main + call .round_shift1 + jmp tx2q +.pass2: + call .transpose_8x8_packed + call m(idct_8x8_internal_8bpc).main + vpbroadcastd m12, [pw_2048] + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call .write_8x4_start + pmulhrsw m0, m2, m12 + pmulhrsw m1, m3, m12 + call .write_8x4 + RET +ALIGN function_align +.write_8x4_start: + vpbroadcastd m11, [pixel_10bpc_max] + lea r6, [strideq*3] + pxor m10, m10 +.write_8x4: + mova xm8, [dstq+strideq*0] + vinserti128 m8, [dstq+strideq*1], 1 + mova xm9, [dstq+strideq*2] + vinserti128 m9, [dstq+r6 ], 1 + mova [cq+32*0], m10 + mova [cq+32*1], m10 + mova [cq+32*2], m10 + mova [cq+32*3], m10 + add cq, 32*4 + paddw m0, m8 + paddw m1, m9 + pmaxsw m0, m10 + pmaxsw m1, m10 + pminsw m0, m11 + pminsw m1, m11 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+r6 ], m1, 1 + lea dstq, [dstq+strideq*4] + ret +ALIGN function_align +.transpose_8x8_packed: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + lea r6, [deint_shuf+128] + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckhdq m2, m4, m1 + punpckldq m4, m1 + vinserti128 m1, m3, xm2, 1 + vperm2i128 m3, m2, 0x31 + vperm2i128 m2, m0, m4, 0x31 + vinserti128 m0, xm4, 1 + ret +ALIGN function_align +.main_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 +.main: + ITX_MULSUB_2D 5, 3, 8, 9, 10, 11, 3406, 2276 ; t5a t6a + ITX_MULSUB_2D 1, 7, 8, 9, 10, 11, 799, 4017 ; t4a t7a + ITX_MULSUB_2D 2, 6, 8, 9, 10, 11, 1567, 3784 ; t2 t3 + paddd m8, m1, m5 ; t4 + psubd m1, m5 ; t5a + paddd m9, m7, m3 ; t7 + psubd m7, m3 ; t6a + vpbroadcastd m3, [pd_2896] + REPX {pmaxsd x, m12}, m1, m8, m7, m9 + REPX {pminsd x, m13}, m1, m8, m7, m9 + REPX {pmulld x, m3 }, 
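+; .round_shift1 adds the +1 rounding bit via pcmpeqd: all-ones is -1,
+; and psubd x, -1 is x + 1, applied before the final >> 1 of pass 1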
+ALIGN function_align
+.round_shift1:
+    pcmpeqd      m1, m1
+    REPX {psubd x, m1}, m0, m6, m5, m3
+    paddd        m1, m6, m7 ; out1
+    psubd        m6, m7     ; out6
+    psubd        m7, m0, m9 ; out7
+    paddd        m0, m9     ; out0
+    paddd        m2, m5, m4 ; out2
+    psubd        m5, m4     ; out5
+    psubd        m4, m3, m8 ; out4
+    paddd        m3, m8     ; out3
+    REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
+    ret
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+
+cglobal iadst_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+.pass1:
+    call .main
+    call .main_end
+    jmp tx2q
+.pass2:
+    call m(idct_8x8_internal_10bpc).transpose_8x8_packed
+    pshufd       m4, m0, q1032
+    pshufd       m5, m1, q1032
+    call m(iadst_8x8_internal_8bpc).main_pass2
+    vpbroadcastd m5, [pw_2048]
+    vpbroadcastd xm12, [pw_4096]
+    psubw        m12, m5
+    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+    pmulhrsw     m0, m12
+    pmulhrsw     m1, m12
+    call m(idct_8x8_internal_10bpc).write_8x4_start
+    pmulhrsw     m0, m2, m12
+    pmulhrsw     m1, m3, m12
+    call m(idct_8x8_internal_10bpc).write_8x4
+    RET
+ALIGN function_align
+.main:
+    mova         m0, [cq+32*0]
+    mova         m7, [cq+32*7]
+    mova         m1, [cq+32*1]
+    mova         m6, [cq+32*6]
+    mova         m2, [cq+32*2]
+    mova         m5, [cq+32*5]
+    mova         m3, [cq+32*3]
+    mova         m4, [cq+32*4]
+    vpbroadcastd m11, [pd_2048]
+.main2:
+    IADST8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
+    psrld        m8, 10 ; pd_1
+    vpbroadcastd m9, [pd_3072]
+    ret
+ALIGN function_align
+.main_end:
+    paddd        m0, m8
+    psubd        m1, m8, m1
+    paddd        m6, m8
+    psubd        m7, m8, m7
+    REPX {psrad x, 1 }, m0, m1, m6, m7
+    ; (1 + ((x + 1024) >> 11)) >> 1 = (3072 + x) >> 12
+    ; (1 - ((x + 1024) >> 11)) >> 1 = (3071 - x) >> 12
+    psubd        m8, m9, m8 ; pd_3071
+    paddd        m2, m9
+    psubd        m3, m8, m3
+    paddd        m4, m9
+    psubd        m5, m8, m5
+    REPX {psrad x, 12}, m2, m3, m4, m5
+    ret
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity
+
+cglobal iflipadst_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+.pass1:
+    call m(iadst_8x8_internal_10bpc).main
+    call .main_end
+    jmp tx2q
+.pass2:
+    call m(idct_8x8_internal_10bpc).transpose_8x8_packed
+    pshufd       m4, m0, q1032
+    pshufd       m5, m1, q1032
+    call m(iadst_8x8_internal_8bpc).main_pass2
+    vpbroadcastd m12, [pw_2048]
+    vpbroadcastd xm5, [pw_4096]
+    psubw        m12, m5
+    vpermq       m8, m3, q2031
+    vpermq       m9, m2, q2031
+    vpermq       m2, m1, q2031
+    vpermq       m3, m0, q2031
+    pmulhrsw     m0, m8, m12
+    pmulhrsw     m1, m9, m12
+    call m(idct_8x8_internal_10bpc).write_8x4_start
+    pmulhrsw     m0, m2, m12
+    pmulhrsw     m1, m3, m12
+    call m(idct_8x8_internal_10bpc).write_8x4
+    RET
+ALIGN function_align
+.main_end:
+    paddd        m10, m8, m0
+    psubd        m0, m8, m7
+    psubd        m7, m8, m1
+    paddd        m1, m8, m6
+    psrad        m0, 1
+    psrad        m1, 1
+    psrad        m6, m7, 1
+    psrad        m7, m10, 1
+    psubd        m8, m9, m8 ; pd_6143
+    psubd        m10, m8, m5
+    paddd        m5, m9, m2
+    psubd        m2, m8, m3
+    paddd        m3, m9, m4
+    psrad        m4, m2, 12
+    psrad        m2, m10, 12
+    psrad        m3, 12
+    psrad        m5, 12
+    ret
+
+INV_TXFM_8X8_FN identity, dct
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
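+; identity 8x8: pass 1 is a pure coefficient load; in effect the two x2
+; identity gains, the inter-pass >>1 and the final >>4 all fold into one
+; pmulhrsw by pw_4096 (a net x/8) in pass 2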
+cglobal iidentity_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+.pass1:
+    mova         m0, [cq+32*0]
+    mova         m1, [cq+32*1]
+    mova         m2, [cq+32*2]
+    mova         m3, [cq+32*3]
+    mova         m4, [cq+32*4]
+    mova         m5, [cq+32*5]
+    mova         m6, [cq+32*6]
+    mova         m7, [cq+32*7]
+    jmp tx2q
+.pass2:
+    packssdw     m3, m7
+    vpbroadcastd m7, [pixel_10bpc_max]
+.pass2_main:
+    packssdw     m0, m4
+    packssdw     m1, m5
+    packssdw     m2, m6
+    vpbroadcastd m12, [pw_4096]
+    punpckhwd    m4, m0, m1
+    punpcklwd    m0, m1
+    punpckhwd    m1, m2, m3
+    punpcklwd    m2, m3
+    punpckhdq    m3, m0, m2
+    punpckldq    m0, m2
+    punpckldq    m2, m4, m1
+    punpckhdq    m4, m1
+    punpckhqdq   m1, m0, m2 ; 1 5
+    punpcklqdq   m0, m2     ; 0 4
+    punpcklqdq   m2, m3, m4 ; 2 6
+    punpckhqdq   m3, m4     ; 3 7
+    pmulhrsw     m0, m12
+    pmulhrsw     m1, m12
+    call .write_2x8x2_start
+    pmulhrsw     m0, m2, m12
+    pmulhrsw     m1, m3, m12
+    call .write_2x8x2_zero
+    RET
+.write_2x8x2_start:
+    lea          r6, [strideq*5]
+    pxor         m6, m6
+.write_2x8x2_zero:
+    mova         [cq+32*0], m6
+    mova         [cq+32*1], m6
+    mova         [cq+32*2], m6
+    mova         [cq+32*3], m6
+    add          cq, 32*4
+.write_2x8x2:
+    mova         xm4, [dstq+strideq*0]
+    vinserti128  m4, [dstq+strideq*4], 1
+    mova         xm5, [dstq+strideq*1]
+    vinserti128  m5, [dstq+r6 ], 1
+    paddw        m0, m4
+    paddw        m1, m5
+    pmaxsw       m0, m6
+    pmaxsw       m1, m6
+    pminsw       m0, m7
+    pminsw       m1, m7
+    mova         [dstq+strideq*0], xm0
+    mova         [dstq+strideq*1], xm1
+    vextracti128 [dstq+strideq*4], m0, 1
+    vextracti128 [dstq+r6 ], m1, 1
+    lea          dstq, [dstq+strideq*2]
+    ret
+
+%macro TRANSPOSE_8X8_DWORD 12 ; src/dst[1-8], tmp[1-4]
+    punpckldq    m%9, m%1, m%2   ; aibj emfn
+    punpckhdq    m%1, m%2        ; ckdl gohp
+    punpckldq    m%10, m%3, m%4  ; qyrz uCvD
+    punpckhdq    m%3, m%4        ; sAtB wExF
+    punpckldq    m%11, m%5, m%6  ; GOHP KSLT
+    punpckhdq    m%5, m%6        ; IQJR MUNV
+    punpckldq    m%12, m%7, m%8  ; WeXf aibj
+    punpckhdq    m%7, m%8        ; YgZh ckdl
+    punpcklqdq   m%2, m%9, m%10  ; aiqy emuC
+    punpckhqdq   m%9, m%10       ; bjrz fnvD
+    punpcklqdq   m%4, m%1, m%3   ; cksA gowE
+    punpckhqdq   m%10, m%1, m%3  ; dltB hpxF
+    punpcklqdq   m%6, m%11, m%12 ; GOWe KSai
+    punpckhqdq   m%11, m%12      ; HPXf LTbj
+    punpcklqdq   m%8, m%5, m%7   ; IQYg MUck
+    punpckhqdq   m%12, m%5, m%7  ; JRZh NVdl
+    vperm2i128   m%1, m%2, m%6, 0x20   ; out0
+    vperm2i128   m%5, m%2, m%6, 0x31   ; out4
+    vperm2i128   m%2, m%9, m%11, 0x20  ; out1
+    vperm2i128   m%6, m%9, m%11, 0x31  ; out5
+    vperm2i128   m%3, m%4, m%8, 0x20   ; out2
+    vperm2i128   m%7, m%4, m%8, 0x31   ; out6
+    vperm2i128   m%4, m%10, m%12, 0x20 ; out3
+    vperm2i128   m%8, m%10, m%12, 0x31 ; out7
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct, 12
+INV_TXFM_8X8_FN dct, identity, 12
+INV_TXFM_8X8_FN dct, adst, 12
+INV_TXFM_8X8_FN dct, flipadst, 12
+
+cglobal idct_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m12, [clip_20b_min]
+    vpbroadcastd m13, [clip_20b_max]
+    jmp m(idct_8x8_internal_10bpc).pass1
+.pass2:
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+    REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+    call .transpose_8x8
+    vpbroadcastd m11, [pd_2048]
+    call m(idct_8x8_internal_10bpc).main
+    call .round_shift4
+    jmp m(iadst_8x8_internal_12bpc).pass2_end
+ALIGN function_align
+.write_8x4_start:
+    vpbroadcastd m11, [pixel_12bpc_max]
+    lea          r6, [strideq*3]
+    pxor         m10, m10
+    ret
+ALIGN function_align
+.transpose_8x8:
+    TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+    ret
+ALIGN function_align
+.round_shift4:
+    vpbroadcastd m1, [pd_8]
+    REPX {paddd x, m1}, m0, m6, m5, m3
+    paddd        m1, m6, m7 ; out1
+    psubd        m6, m7     ; out6
+    psubd        m7, m0, m9 ; out7
+    paddd        m0, m9     ; out0
+    paddd        m2, m5, m4 ; out2
+    psubd        m5, m4     ; out5
+    psubd        m4, m3, m8 ; out4
+    paddd        m3, m8     ; out3
+    REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7
+    ret
+
+INV_TXFM_8X8_FN adst, dct, 12
+INV_TXFM_8X8_FN adst, adst, 12
+INV_TXFM_8X8_FN adst, flipadst, 12
+INV_TXFM_8X8_FN adst, identity, 12
+
+cglobal iadst_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m12, [clip_20b_min]
+    vpbroadcastd m13, [clip_20b_max]
+    jmp m(iadst_8x8_internal_10bpc).pass1
+.pass2:
+    call .pass2_main
+.pass2_end:
+    packssdw     m0, m1
+    packssdw     m1, m2, m3
+    REPX {vpermq x, x, q3120}, m0, m1
+    call m(idct_8x8_internal_12bpc).write_8x4_start
+    call m(idct_8x8_internal_10bpc).write_8x4
+    packssdw     m0, m4, m5
+    packssdw     m1, m6, m7
+    REPX {vpermq x, x, q3120}, m0, m1
+    call m(idct_8x8_internal_10bpc).write_8x4
+    RET
+ALIGN function_align
+.pass2_main:
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+    REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+    call m(idct_8x8_internal_12bpc).transpose_8x8
+    vpbroadcastd m11, [pd_2048]
+.pass2_main2:
+    call m(iadst_8x8_internal_10bpc).main2
+    pslld        m9, m8, 3 ; pd_8
+    paddd        m0, m9
+    psubd        m1, m9, m1 ; 8+x
+    paddd        m6, m9
+    psubd        m7, m9, m7
+    REPX {psrad x, 4}, m0, m1, m6, m7
+    vpbroadcastd m9, [pd_17408]
+    psubd        m8, m9, m8 ; 17407
+    paddd        m2, m9
+    psubd        m3, m8, m3
+    paddd        m4, m9
+    psubd        m5, m8, m5
+    REPX {psrad x, 15}, m2, m3, m4, m5
+    ret
+
+INV_TXFM_8X8_FN flipadst, dct, 12
+INV_TXFM_8X8_FN flipadst, adst, 12
+INV_TXFM_8X8_FN flipadst, flipadst, 12
+INV_TXFM_8X8_FN flipadst, identity, 12
+
+cglobal iflipadst_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m12, [clip_20b_min]
+    vpbroadcastd m13, [clip_20b_max]
+    jmp m(iflipadst_8x8_internal_10bpc).pass1
+.pass2:
+    call m(iadst_8x8_internal_12bpc).pass2_main
+    packssdw     m7, m7, m6
+    packssdw     m6, m1, m0
+    packssdw     m1, m5, m4
+    vpermq       m0, m7, q3120
+    vpermq       m1, m1, q3120
+    call m(idct_8x8_internal_12bpc).write_8x4_start
+    call m(idct_8x8_internal_10bpc).write_8x4
+    packssdw     m0, m3, m2
+    vpermq       m0, m0, q3120
+    vpermq       m1, m6, q3120
+    call m(idct_8x8_internal_10bpc).write_8x4
+    RET
+
+INV_TXFM_8X8_FN identity, dct, 12
+INV_TXFM_8X8_FN identity, adst, 12
+INV_TXFM_8X8_FN identity, flipadst, 12
+INV_TXFM_8X8_FN identity, identity, 12
+
+cglobal iidentity_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    jmp m(iidentity_8x8_internal_10bpc).pass1
+.pass2:
+    packssdw     m3, m7
+    vpbroadcastd m7, [pixel_12bpc_max]
+    jmp m(iidentity_8x8_internal_10bpc).pass2_main
+
+%macro INV_TXFM_8X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth
+    INV_TXFM_FN %1, %2, %3, 8x16, %4
+%ifidn %1_%2, dct_dct
+    imul         r6d, [cq], 181
+    vpbroadcastd m2, [dconly_%4bpc]
+    mov          [cq], eobd ; 0
+    or           r3d, 16
+    add          r6d, 128
+    sar          r6d, 8
+    imul         r6d, 181
+    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2
+%endif
+%endmacro
+
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, identity, 35
+INV_TXFM_8X16_FN dct, adst
+INV_TXFM_8X16_FN dct, flipadst
+
+cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+.pass1:
+    vpbroadcastd m14, [pd_2896]
+    vpbroadcastd m11, [pd_2048]
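+    ; pass 1 runs in two halves: with eob >= 43 the second half of the
+    ; coefficients can be non-zero, so .pass1_main is called twice (the
+    ; first call parks its results in the odd 32-byte slots of cq);
+    ; otherwise a single call suffices and rows 8-15 are simply zeroed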
+    cmp          eobd, 43
+    jl .fast
+    add          cq, 32
+    call .pass1_main
+    sub          cq, 32
+    mova         [cq+32* 1], m0
+    mova         [cq+32* 3], m1
+    mova         [cq+32* 5], m2
+    mova         [cq+32* 7], m3
+    mova         [cq+32* 9], m4
+    mova         [cq+32*11], m5
+    mova         [cq+32*13], m6
+    mova         m15, m7
+    call .pass1_main
+    mova         m8, [cq+32* 1]
+    mova         m9, [cq+32* 3]
+    mova         m10, [cq+32* 5]
+    mova         m11, [cq+32* 7]
+    mova         m12, [cq+32* 9]
+    mova         m13, [cq+32*11]
+    mova         m14, [cq+32*13]
+    jmp tx2q
+.fast:
+    call .pass1_main
+    pxor         m8, m8
+    REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+    jmp tx2q
+.pass2:
+    call .transpose
+    call m(idct_8x16_internal_8bpc).main
+    vpbroadcastd m12, [pw_2048]
+    REPX {vpermq x, x, q3120}, m0, m2, m4, m6
+    REPX {vpermq x, x, q2031}, m1, m3, m5, m7
+.end:
+    pmulhrsw     m0, m12
+    pmulhrsw     m1, m12
+    call m(idct_8x8_internal_10bpc).write_8x4_start
+    pmulhrsw     m0, m2, m12
+    pmulhrsw     m1, m3, m12
+    call m(idct_8x8_internal_10bpc).write_8x4
+    pmulhrsw     m0, m4, m12
+    pmulhrsw     m1, m5, m12
+    call m(idct_8x8_internal_10bpc).write_8x4
+    pmulhrsw     m0, m6, m12
+    pmulhrsw     m1, m7, m12
+    call m(idct_8x8_internal_10bpc).write_8x4
+    RET
+ALIGN function_align
+.transpose:
+    packssdw     m0, m8
+    packssdw     m1, m9
+    packssdw     m2, m10
+    packssdw     m3, m11
+    packssdw     m4, m12
+    packssdw     m5, m13
+    packssdw     m6, m14
+    packssdw     m7, m15
+    lea          r6, [deint_shuf+128]
+    punpckhwd    m8, m0, m1
+    punpcklwd    m0, m1
+    punpckhwd    m1, m2, m3
+    punpcklwd    m2, m3
+    punpcklwd    m3, m4, m5
+    punpckhwd    m4, m5
+    punpckhwd    m5, m6, m7
+    punpcklwd    m6, m7
+    punpckhdq    m7, m3, m6
+    punpckldq    m3, m6
+    punpckhdq    m6, m4, m5
+    punpckldq    m4, m5
+    punpckhdq    m5, m8, m1
+    punpckldq    m8, m1
+    punpckhdq    m1, m0, m2
+    punpckldq    m0, m2
+    vperm2i128   m2, m0, m3, 0x31
+    vinserti128  m0, xm3, 1
+    vperm2i128   m3, m1, m7, 0x31
+    vinserti128  m1, xm7, 1
+    vperm2i128   m7, m5, m6, 0x31
+    vinserti128  m5, xm6, 1
+    vperm2i128   m6, m8, m4, 0x31
+    vinserti128  m4, m8, xm4, 1
+    ret
+ALIGN function_align
+.pass1_main:
+    pmulld       m0, m14, [cq+32* 0]
+    pmulld       m1, m14, [cq+32* 2]
+    pmulld       m2, m14, [cq+32* 4]
+    pmulld       m3, m14, [cq+32* 6]
+    pmulld       m4, m14, [cq+32* 8]
+    pmulld       m5, m14, [cq+32*10]
+    pmulld       m6, m14, [cq+32*12]
+    pmulld       m7, m14, [cq+32*14]
+    call m(idct_8x8_internal_10bpc).main_rect2
+    jmp m(idct_8x8_internal_10bpc).round_shift1
+ALIGN function_align
+.main_evenhalf:
+    paddd        m1, m6, m7 ; idct8 out1
+    psubd        m6, m7     ; idct8 out6
+    psubd        m7, m0, m9 ; idct8 out7
+    paddd        m0, m9     ; idct8 out0
+    paddd        m2, m5, m4 ; idct8 out2
+    psubd        m5, m4     ; idct8 out5
+    psubd        m4, m3, m8 ; idct8 out4
+    paddd        m3, m8     ; idct8 out3
+    REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+    ret
+.main_oddhalf_fast_rect2:
+    REPX {paddd x, m11}, m0, m1, m2, m3
+    REPX {psrad x, 12 }, m0, m1, m2, m3
+.main_oddhalf_fast: ; lower half zero
+    vpbroadcastd m7, [pd_4076]
+    vpbroadcastd m8, [pd_401]
+    vpbroadcastd m6, [pd_m1189]
+    vpbroadcastd m9, [pd_3920]
+    vpbroadcastd m5, [pd_3612]
+    vpbroadcastd m10, [pd_1931]
+    vpbroadcastd m4, [pd_m2598]
+    vpbroadcastd m15, [pd_3166]
+    pmulld       m7, m0
+    pmulld       m0, m8
+    pmulld       m6, m1
+    pmulld       m1, m9
+    pmulld       m5, m2
+    pmulld       m2, m10
+    pmulld       m4, m3
+    pmulld       m3, m15
+    jmp .main_oddhalf_fast2
+.main_oddhalf_rect2:
+    REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+.main_oddhalf:
+    ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 401, 4076 ; t8a, t15a
+    ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a
+    ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a
+    ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3166, 2598 ; t9a, t14a
+.main_oddhalf_fast2:
+    REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3
+    REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3
+    psubd        m8, m0, m4 ; t9
+    paddd        m0, m4     ; t8
+    psubd        m4, m6, m2 ; t10
+    paddd        m2, m6     ; t11
+    psubd        m6, m1, m5 ; t13
+    paddd        m5, m1     ; t12
+    psubd        m1, m7, m3 ; t14
+    paddd        m7, m3     ; t15
+    REPX {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7
+    REPX {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7
+    vpbroadcastd m15, [pd_3784]
+    vpbroadcastd m10, [pd_1567]
+    ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15
+    ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 2
+    psubd        m3, m1, m4 ; t10
+    paddd        m1, m4     ; t9
+    psubd        m4, m0, m2 ; t11a
+    paddd        m0, m2     ; t8a
+    psubd        m2, m8, m6 ; t13
+    paddd        m6, m8     ; t14
+    psubd        m8, m7, m5 ; t12a
+    paddd        m7, m5     ; t15a
+    REPX {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7
+    REPX {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7
+    REPX {pmulld x, m14}, m2, m8, m3, m4
+    paddd        m2, m11
+    paddd        m8, m11
+    paddd        m5, m2, m3 ; t13a
+    psubd        m2, m3     ; t10a
+    psubd        m3, m8, m4 ; t11
+    paddd        m4, m8     ; t12
+    REPX {psrad x, 12}, m5, m2, m3, m4
+    mova         [r6-32*4], m7
+    mova         [r6-32*3], m6
+    mova         [r6-32*2], m5
+    mova         [r6-32*1], m4
+    mova         [r6+32*0], m3
+    mova         [r6+32*1], m2
+    mova         [r6+32*2], m1
+    mova         [r6+32*3], m0
+    ret
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, adst
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, identity, 35
+
+cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+.pass1:
+    vpbroadcastd m14, [pd_2896]
+    vpbroadcastd m11, [pd_2048]
+    cmp          eobd, 43
+    jl .fast
+    add          cq, 32
+    call .pass1_main
+    call m(iadst_8x8_internal_10bpc).main_end
+    sub          cq, 32
+    mova         [cq+32* 1], m0
+    mova         [cq+32* 3], m1
+    mova         [cq+32* 5], m2
+    mova         [cq+32* 7], m3
+    mova         [cq+32* 9], m4
+    mova         [cq+32*11], m5
+    mova         [cq+32*13], m6
+    mova         m15, m7
+    call .pass1_main
+    call m(iadst_8x8_internal_10bpc).main_end
+    mova         m8, [cq+32* 1]
+    mova         m9, [cq+32* 3]
+    mova         m10, [cq+32* 5]
+    mova         m11, [cq+32* 7]
+    mova         m12, [cq+32* 9]
+    mova         m13, [cq+32*11]
+    mova         m14, [cq+32*13]
+    jmp tx2q
+.fast:
+    call .pass1_main
+    call m(iadst_8x8_internal_10bpc).main_end
+    pxor         m8, m8
+    REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+    jmp tx2q
+.pass2:
+    call m(idct_8x16_internal_10bpc).transpose
+    call m(iadst_8x16_internal_8bpc).main
+    call m(iadst_8x16_internal_8bpc).main_pass2_end
+    vpbroadcastd m8, [pw_2048]
+    vpbroadcastd xm12, [pw_4096]
+    REPX {vpermq x, x, q2031}, m0, m1, m2, m3
+    REPX {vpermq x, x, q3120}, m4, m5, m6, m7
+    psubw        m12, m8
+    jmp m(idct_8x16_internal_10bpc).end
+ALIGN function_align
+.pass1_main:
+    pmulld       m0, m14, [cq+32* 0]
+    pmulld       m7, m14, [cq+32*14]
+    pmulld       m1, m14, [cq+32* 2]
+    pmulld       m6, m14, [cq+32*12]
+    pmulld       m2, m14, [cq+32* 4]
+    pmulld       m5, m14, [cq+32*10]
+    pmulld       m3, m14, [cq+32* 6]
+    pmulld       m4, m14, [cq+32* 8]
+    REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+    jmp m(iadst_8x8_internal_10bpc).main2
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+INV_TXFM_8X16_FN flipadst, identity, 35
+
+cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+.pass1:
+    vpbroadcastd m14, [pd_2896]
+    vpbroadcastd m11, [pd_2048]
+    cmp          eobd, 43
+    jl .fast
+    add          cq, 32
+    call m(iadst_8x16_internal_10bpc).pass1_main
+    call m(iflipadst_8x8_internal_10bpc).main_end
+    sub          cq, 32
+    mova         [cq+32* 1], m0
+    mova         [cq+32* 3], m1
+    mova         [cq+32* 5], m2
+    mova         [cq+32* 7], m3
+    mova         [cq+32* 9], m4
+    mova         [cq+32*11], m5
+    mova         [cq+32*13], m6
+    mova         m15, m7
+    call m(iadst_8x16_internal_10bpc).pass1_main
+    call m(iflipadst_8x8_internal_10bpc).main_end
+    mova         m8, [cq+32* 1]
+    mova         m9, [cq+32* 3]
+    mova         m10, [cq+32* 5]
+    mova         m11, [cq+32* 7]
+    mova         m12, [cq+32* 9]
+    mova         m13, [cq+32*11]
+    mova         m14, [cq+32*13]
+    jmp tx2q
+.fast:
+    call m(iadst_8x16_internal_10bpc).pass1_main
+    call m(iflipadst_8x8_internal_10bpc).main_end
+    pxor         m8, m8
+    REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+    jmp tx2q
+.pass2:
+    call m(idct_8x16_internal_10bpc).transpose
+    call m(iadst_8x16_internal_8bpc).main
+    call m(iadst_8x16_internal_8bpc).main_pass2_end
+    vpbroadcastd m12, [pw_2048]
+    vpbroadcastd xm13, [pw_4096]
+    mova         m11, m0
+    vpermq       m0, m7, q2031
+    mova         m10, m1
+    vpermq       m1, m6, q2031
+    mova         m9, m2
+    vpermq       m2, m5, q2031
+    mova         m8, m3
+    vpermq       m3, m4, q2031
+    vpermq       m4, m8, q3120
+    vpermq       m5, m9, q3120
+    vpermq       m6, m10, q3120
+    vpermq       m7, m11, q3120
+    psubw        m12, m13
+    jmp m(idct_8x16_internal_10bpc).end
+
+INV_TXFM_8X16_FN identity, dct
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
+%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
+    pmulhrsw     m%2, m%3, m%1
+%if %0 == 4 ; if downshifting by 1
+%ifnum %4
+    pmulhrsw     m%2, m%4
+%else ; without rounding
+    psraw        m%2, 1
+%endif
+%else
+    paddsw       m%1, m%1
+%endif
+    paddsw       m%1, m%2
+%endmacro
+
+cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+.pass1:
+    vpbroadcastd m15, [pd_2896]
+    pmulld       m0, m15, [cq+32* 0]
+    pmulld       m8, m15, [cq+32* 1]
+    pmulld       m1, m15, [cq+32* 2]
+    pmulld       m9, m15, [cq+32* 3]
+    pmulld       m2, m15, [cq+32* 4]
+    pmulld       m10, m15, [cq+32* 5]
+    pmulld       m3, m15, [cq+32* 6]
+    pmulld       m11, m15, [cq+32* 7]
+    pmulld       m4, m15, [cq+32* 8]
+    pmulld       m12, m15, [cq+32* 9]
+    pmulld       m5, m15, [cq+32*10]
+    pmulld       m13, m15, [cq+32*11]
+    pmulld       m6, m15, [cq+32*12]
+    pmulld       m14, m15, [cq+32*13]
+    pmulld       m7, m15, [cq+32*14]
+    pmulld       m15, [cq+32*15]
+    mova         [cq], m7
+    vpbroadcastd m7, [pd_2048]
+    REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \
+                        m8, m9, m10, m11, m12, m13, m14, m15
+    paddd        m7, [cq]
+    REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7, \
+                        m8, m9, m10, m11, m12, m13, m14, m15
+    jmp tx2q
+.pass2:
+    packssdw     m0, m8
+    packssdw     m1, m9
+    packssdw     m2, m10
+    packssdw     m3, m11
+    packssdw     m4, m12
+    packssdw     m5, m13
+    packssdw     m6, m14
+    packssdw     m13, m7, m15
+    vpbroadcastd m8, [pw_1697x16]
+    REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 13
+    vpbroadcastd m7, [pixel_10bpc_max]
+    vpbroadcastd m12, [pw_2048]
+    call .pass2_end
+    RET
+ALIGN function_align
+.pass2_end:
+    punpckhwd    m9, m0, m1
+    punpcklwd    m0, m1
+    punpckhwd    m1, m6, m13
+    punpcklwd    m6, m13
+    punpckhwd    m13, m4, m5
+    punpcklwd    m4, m5
+    punpcklwd    m5, m2, m3
+    punpckhwd    m2, m3
+    punpckhdq    m3, m0, m5
+    punpckldq    m0, m5
+    punpckhdq    m11, m9, m2
+    punpckldq    m9, m2
+    punpckldq    m2, m4, m6
+    punpckhdq    m4, m6
+    punpckldq    m6, m13, m1
+    punpckhdq    m13, m1
+    punpckhqdq   m1, m0, m2
+    punpcklqdq   m0, m2
+    punpcklqdq   m2, m3, m4
+    punpckhqdq   m3, m4
+    punpcklqdq   m8, m9, m6
+    punpckhqdq   m9, m6
+    punpcklqdq   m10, m11, m13
+    punpckhqdq   m11, m13
+    pmulhrsw     m0, m12
+    pmulhrsw     m1, m12
+    call m(iidentity_8x8_internal_10bpc).write_2x8x2_start
+    pmulhrsw     m0, m12, m2
+    pmulhrsw     m1, m12, m3
+    call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero
+    pmulhrsw     m0, m12, m8
+    pmulhrsw     m1, m12, m9
+    lea          dstq, [dstq+strideq*4]
+    call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero
+    pmulhrsw     m0, m12, m10
+    pmulhrsw     m1, m12, m11
+    call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero
+    ret
+
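+; the 12bpc variants share the 10bpc first pass, only widening the
+; coefficient clamp to 20 bits; the second pass re-clamps to 18 bits
+; and rounds with wider offsets before the final downshift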
+INV_TXFM_8X16_FN dct, dct, 0, 12
+INV_TXFM_8X16_FN dct, identity, 35, 12
+INV_TXFM_8X16_FN dct, adst, 0, 12
+INV_TXFM_8X16_FN dct, flipadst, 0, 12
+
+cglobal idct_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+    vpbroadcastd m12, [clip_20b_min]
+    vpbroadcastd m13, [clip_20b_max]
+    jmp m(idct_8x16_internal_10bpc).pass1
+.pass2:
+    lea          r6, [rsp+32*4]
+    call .transpose
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+    mova         [cq+32* 8], m0
+    mova         [cq+32*10], m2
+    mova         [cq+32*12], m4
+    mova         [cq+32*14], m6
+    pmaxsd       m0, m12, [cq+32* 1]
+    pmaxsd       m4, m12, m1
+    pmaxsd       m1, m12, [cq+32* 3]
+    pmaxsd       m2, m12, [cq+32* 5]
+    pmaxsd       m6, m12, m5
+    pmaxsd       m5, m12, m3
+    pmaxsd       m3, m12, [cq+32* 7]
+    pmaxsd       m7, m12
+    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+    vpbroadcastd m11, [pd_2048]
+    vpbroadcastd m14, [pd_2896]
+    call m(idct_8x16_internal_10bpc).main_oddhalf
+    pmaxsd       m0, m12, [cq+32* 0]
+    pmaxsd       m1, m12, [cq+32* 2]
+    pmaxsd       m2, m12, [cq+32* 4]
+    pmaxsd       m3, m12, [cq+32* 6]
+    pmaxsd       m4, m12, [cq+32* 8]
+    pmaxsd       m5, m12, [cq+32*10]
+    pmaxsd       m6, m12, [cq+32*12]
+    pmaxsd       m7, m12, [cq+32*14]
+    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+    call m(idct_8x8_internal_10bpc).main
+    call m(idct_8x16_internal_10bpc).main_evenhalf
+    vpbroadcastd m11, [pd_8]
+    REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+    call m(idct_16x8_internal_10bpc).pass1_rotations
+    REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \
+                       m8, m9, m10, m11, m12, m13, m14, m15
+.end:
+    packssdw     m0, m1
+    packssdw     m1, m2, m3
+    packssdw     m2, m4, m5
+    packssdw     m3, m6, m7
+    packssdw     m4, m8, m9
+    packssdw     m5, m10, m11
+    packssdw     m6, m12, m13
+    packssdw     m7, m14, m15
+    vpermq       m0, m0, q3120
+    vpermq       m1, m1, q3120
+    call m(idct_8x8_internal_12bpc).write_8x4_start
+    call m(idct_8x8_internal_10bpc).write_8x4
+    vpermq       m0, m2, q3120
+    vpermq       m1, m3, q3120
+    call m(idct_8x8_internal_10bpc).write_8x4
+    vpermq       m0, m4, q3120
+    vpermq       m1, m5, q3120
+    call m(idct_8x8_internal_10bpc).write_8x4
+    vpermq       m0, m6, q3120
+    vpermq       m1, m7, q3120
+    call m(idct_8x8_internal_10bpc).write_8x4
+    RET
+ALIGN function_align
+.transpose:
+    mova         [cq+32* 8], m8
+    mova         [cq+32* 9], m9
+    mova         [cq+32*10], m10
+    mova         [cq+32*11], m11
+    call m(idct_8x8_internal_12bpc).transpose_8x8
+    mova         [cq+32* 0], m0
+    mova         [cq+32* 1], m1
+    mova         [cq+32* 2], m2
+    mova         [cq+32* 3], m3
+    mova         [cq+32* 4], m4
+    mova         [cq+32* 5], m5
+    mova         [cq+32* 6], m6
+    mova         [cq+32* 7], m7
+    mova         m0, [cq+32* 8]
+    mova         m1, [cq+32* 9]
+    mova         m2, [cq+32*10]
+    mova         m3, [cq+32*11]
+    mova         m4, m12
+    mova         m5, m13
+    mova         m6, m14
+    mova         m7, m15
+    jmp m(idct_8x8_internal_12bpc).transpose_8x8
+
+INV_TXFM_8X16_FN adst, dct, 0, 12
+INV_TXFM_8X16_FN adst, adst, 0, 12
+INV_TXFM_8X16_FN adst, flipadst, 0, 12
+INV_TXFM_8X16_FN adst, identity, 35, 12
+
+cglobal iadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+    vpbroadcastd m12, [clip_20b_min]
+    vpbroadcastd m13, [clip_20b_max]
+    jmp m(iadst_8x16_internal_10bpc).pass1
+.pass2:
+    lea          r6, [rsp+32*4]
+    call .pass2_main
+    call m(iadst_16x8_internal_10bpc).pass1_rotations
+.pass2_end:
+    REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
+    REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11
+    jmp m(idct_8x16_internal_12bpc).end
+ALIGN function_align
+.pass2_main:
+    call m(idct_8x16_internal_12bpc).transpose
+    vpbroadcastd m13, [clip_18b_min]
+    vpbroadcastd m14, [clip_18b_max]
+    mova         [cq+32* 8], m0
+    mova         [cq+32*11], m3
+    mova         [cq+32*12], m4
+    mova         [cq+32*15], m7
+    pmaxsd       m0, m13, [cq+32* 2] ; 2
+    pmaxsd       m3, m13, m1         ; 9
+    pmaxsd       m1, m13, m5         ; 13
+    pmaxsd       m4, m13, m2         ; 10
+    pmaxsd       m2, m13, [cq+32* 6] ; 6
+    pmaxsd       m5, m13, [cq+32* 5] ; 5
+    pmaxsd       m6, m13, m6         ; 14
+    pmaxsd       m7, m13, [cq+32* 1] ; 1
+    REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
+    vpbroadcastd m12, [pd_2048]
+    vpbroadcastd m15, [pd_2896]
+    call m(iadst_16x8_internal_10bpc).main_part1
+    pmaxsd       m0, m13, [cq+32* 0] ; 0
+    pmaxsd       m1, m13, [cq+32*15] ; 15
+    pmaxsd       m2, m13, [cq+32* 4] ; 4
+    pmaxsd       m3, m13, [cq+32*11] ; 11
+    pmaxsd       m4, m13, [cq+32* 8] ; 8
+    pmaxsd       m5, m13, [cq+32* 7] ; 7
+    pmaxsd       m6, m13, [cq+32*12] ; 12
+    pmaxsd       m7, m13, [cq+32* 3] ; 3
+    REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
+    call m(iadst_16x8_internal_10bpc).main_part2
+    vpbroadcastd m14, [pd_17408]
+    psrld        m15, 11     ; pd_1
+    psubd        m13, m14, m15 ; pd_17407
+    pslld        m15, 3      ; pd_8
+    ret
+
+INV_TXFM_8X16_FN flipadst, dct, 0, 12
+INV_TXFM_8X16_FN flipadst, adst, 0, 12
+INV_TXFM_8X16_FN flipadst, flipadst, 0, 12
+INV_TXFM_8X16_FN flipadst, identity, 35, 12
+
+cglobal iflipadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+    vpbroadcastd m12, [clip_20b_min]
+    vpbroadcastd m13, [clip_20b_max]
+    jmp m(iflipadst_8x16_internal_10bpc).pass1
+.pass2:
+    lea          r6, [rsp+32*4]
+    call m(iadst_8x16_internal_12bpc).pass2_main
+    call m(iflipadst_16x8_internal_10bpc).pass1_rotations
+    jmp m(iadst_8x16_internal_12bpc).pass2_end
+
+INV_TXFM_8X16_FN identity, dct, 0, 12
+INV_TXFM_8X16_FN identity, adst, 0, 12
+INV_TXFM_8X16_FN identity, flipadst, 0, 12
+INV_TXFM_8X16_FN identity, identity, 0, 12
+
+cglobal iidentity_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+    jmp m(iidentity_8x16_internal_10bpc).pass1
+.pass2:
+    call .pass2_main
+    packssdw     m0, m8
+    packssdw     m1, m9
+    packssdw     m2, m10
+    packssdw     m3, m11
+    packssdw     m4, m12
+    packssdw     m5, m13
+    packssdw     m6, m14
+    packssdw     m13, m7, m15
+    vpbroadcastd m7, [pixel_12bpc_max]
+    vpbroadcastd m12, [pw_16384]
+    call m(iidentity_8x16_internal_10bpc).pass2_end
+    RET
+ALIGN function_align
+.pass2_main:
+    mova         [cq], m7
+    vpbroadcastd m7, [clip_18b_min]
+    REPX {pmaxsd x, m7}, m0, m1, m2, m3, m4, m5, m6, \
+                         m8, m9, m10, m11, m12, m13, m14, m15
+    pmaxsd       m7, [cq]
+    mova         [cq], m15
+    vpbroadcastd m15, [clip_18b_max]
+    REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+                          m8, m9, m10, m11, m12, m13, m14
+    pminsd       m15, [cq]
+    mova         [cq], m7
+    vpbroadcastd m7, [pd_5793]
+    REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6, \
+                         m8, m9, m10, m11, m12, m13, m14, m15
+    pmulld       m7, [cq]
+    mova         [cq], m15
+    vpbroadcastd m15, [pd_1024]
+    REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+                         m8, m9, m10, m11, m12, m13, m14
+    paddd        m15, [cq]
+    REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7, \
+                        m8, m9, m10, m11, m12, m13, m14, m15
+    ret
+
+%macro INV_TXFM_16X4_FN 2-3 10 ; type1, type2, bitdepth
+    INV_TXFM_FN %1, %2, 0, 16x4, %3
+%ifidn %1_%2, dct_dct
+    vpbroadcastd m3, [dconly_%3bpc]
+%if %3 = 10
+.dconly:
+    imul         r6d, [cq], 181
+    mov          [cq], eobd ; 0
+    or           r3d, 4
+.dconly2:
+    add          r6d, 384
+    sar          r6d, 9
+.dconly3:
+    imul         r6d, 181
+    add          r6d, 2176
+    sar          r6d, 12
+    movd         xm0, r6d
+    paddsw       xm0, xm3
+    vpbroadcastw m0, xm0
+.dconly_loop:
+    paddsw       m1, m0, [dstq+strideq*0]
+    paddsw       m2, m0, [dstq+strideq*1]
+    psubusw      m1, m3
+    psubusw      m2, m3
+    mova         [dstq+strideq*0], m1
+    mova         [dstq+strideq*1], m2
+    lea          dstq, [dstq+strideq*2]
+    sub          r3d, 2
+    jg .dconly_loop
+    RET
+%else
+    jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly
+%endif
+%endif
+%endmacro
+
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, identity
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+
+cglobal idct_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m8, [clip_18b_min]
+    vpbroadcastd m9, [clip_18b_max]
+.pass1:
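+    ; each 16-byte coefficient row is broadcast to both lanes and merged
+    ; with shufpd so that one ymm register carries two rows; the row
+    ; pairs are given by the comments on the shufpd lines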
+    vbroadcasti128 m0, [cq+16* 0]
+    vbroadcasti128 m4, [cq+16* 4]
+    vbroadcasti128 m1, [cq+16* 2]
+    vbroadcasti128 m7, [cq+16* 6]
+    vbroadcasti128 m5, [cq+16*10]
+    vbroadcasti128 m2, [cq+16* 8]
+    vbroadcasti128 m6, [cq+16*12]
+    vbroadcasti128 m3, [cq+16*14]
+    shufpd       m0, m4, 0x0c ; 0 4
+    shufpd       m1, m5, 0x0c ; 2 10
+    shufpd       m2, m6, 0x0c ; 8 12
+    shufpd       m3, m7, 0x0c ; 14 6
+    call .pass1_main
+    vbroadcasti128 m10, [cq+16* 1]
+    vbroadcasti128 m4, [cq+16* 5]
+    vbroadcasti128 m11, [cq+16*15]
+    vbroadcasti128 m5, [cq+16*11]
+    shufpd       m10, m4, 0x0c ; 1 5
+    shufpd       m11, m5, 0x0c ; 15 11
+    vbroadcasti128 m5, [cq+16* 9]
+    vbroadcasti128 m4, [cq+16*13]
+    shufpd       m5, m4, 0x0c ; 9 13
+    vbroadcasti128 m6, [cq+16* 7]
+    vbroadcasti128 m4, [cq+16* 3]
+    shufpd       m6, m4, 0x0c ; 7 3
+    call .pass1_main2
+    pcmpeqd      m4, m4
+    REPX {psubd x, m4}, m0, m1, m2, m3
+    call .pass1_main3
+    REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
+    jmp tx2q
+.pass2:
+    call .transpose_4x16_packed
+    lea          r6, [deint_shuf+128]
+    call m(idct_16x4_internal_8bpc).main
+.end:
+    vpbroadcastd m4, [pw_2048]
+    REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+    vpbroadcastd m5, [pixel_10bpc_max]
+.end2:
+    paddw        m0, [dstq+strideq*0]
+    paddw        m1, [dstq+strideq*1]
+.end3:
+    lea          r6, [dstq+strideq*2]
+    paddw        m2, [r6 +strideq*0]
+    paddw        m3, [r6 +strideq*1]
+    pxor         m4, m4
+    REPX {mova [cq+32*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7
+    REPX {pmaxsw x, m4}, m0, m1, m2, m3
+    REPX {pminsw x, m5}, m0, m1, m2, m3
+    mova         [dstq+strideq*0], m0
+    mova         [dstq+strideq*1], m1
+    mova         [r6 +strideq*0], m2
+    mova         [r6 +strideq*1], m3
+    RET
+ALIGN function_align
+.pass1_main:
+    vpbroadcastd m7, [pd_2048]
+    call m(idct_8x4_internal_10bpc).main
+    psubd        m3, m0, m4 ; idct8 out7 out6
+    paddd        m0, m4     ; idct8 out0 out1
+    paddd        m1, m2, m5 ; idct8 out3 out2
+    psubd        m2, m5     ; idct8 out4 out5
+    ret
+ALIGN function_align
+.pass1_main2:
+    ITX_MULSUB_2D 10, 11, 4, 12, 13, 7, 401_1931, 4076_3612, 1
+    ITX_MULSUB_2D 5, 6, 4, 12, 13, 7, 3166_3920, 2598_1189, 1
+    vbroadcasti128 m12, [pd_3784_m3784]
+    psubd        m4, m10, m5
+    paddd        m10, m5    ; t8 t11
+    psignd       m4, m12    ; t9 t10
+    psubd        m5, m11, m6
+    paddd        m11, m6    ; t15 t12
+    psignd       m5, m12    ; t14 t13
+    vpbroadcastd m6, [pd_1567]
+    vpbroadcastd m13, [pd_3784]
+    REPX {pmaxsd x, m8}, m5, m4
+    REPX {pminsd x, m9}, m5, m4
+    pmulld       m12, m5
+    pmulld       m5, m6
+    vbroadcasti128 m6, [pd_1567_m1567]
+    pmulld       m13, m4
+    pmulld       m4, m6
+    REPX {pmaxsd x, m8}, m10, m11, m0, m1
+    REPX {pminsd x, m9}, m10, m11, m0, m1
+    paddd        m12, m7
+    paddd        m5, m7
+    paddd        m4, m12
+    psubd        m5, m13
+    psrad        m4, 12     ; t14a t10a
+    psrad        m5, 12     ; t9a t13a
+    vpbroadcastd m12, [pd_2896]
+    punpckhqdq   m6, m11, m5
+    punpcklqdq   m11, m4
+    punpckhqdq   m4, m10, m4
+    punpcklqdq   m10, m5
+    psubd        m5, m11, m6 ; t12a t13
+    paddd        m11, m6     ; t15a t14
+    psubd        m6, m10, m4 ; t11a t10
+    paddd        m10, m4     ; t8a t9
+    REPX {pmaxsd x, m8}, m5, m6
+    REPX {pminsd x, m9}, m5, m6
+    pmulld       m5, m12
+    pmulld       m6, m12
+    REPX {pmaxsd x, m8}, m2, m3, m11, m10
+    REPX {pminsd x, m9}, m2, m3, m11, m10
+    ret
+ALIGN function_align
+.pass1_main3:
+    paddd        m5, m7
+    psubd        m4, m5, m6
+    paddd        m5, m6
+    psrad        m4, 12      ; t11 t10a
+    psrad        m5, 12      ; t12 t13a
+    psubd        m7, m0, m11 ; out15 out14
+    paddd        m0, m11     ; out0 out1
+    psubd        m6, m1, m5  ; out12 out13
+    paddd        m1, m5      ; out3 out2
+    psubd        m5, m2, m4  ; out11 out10
+    paddd        m2, m4      ; out4 out5
+    psubd        m4, m3, m10 ; out8 out9
+    paddd        m3, m10     ; out7 out6
+    REPX {pshufd x, x, q1032}, m1, m3, m5, m7
+    ret
+ALIGN function_align
+.transpose_4x16_packed:
+    vbroadcasti128 m8, [deint_shuf]
+    packssdw     m0, m1
+    packssdw     m2, m3
+    packssdw     m4, m5
+    packssdw     m6, m7
+    REPX {pshufb x, m8}, m0, m2, m4, m6
+    punpckhqdq   m1, m0, m2
+    punpcklqdq   m0, m2
+    punpckhqdq   m2, m4, m6
+    punpcklqdq   m4, m6
+    vperm2i128   m3, m1, m2, 0x31
+    vinserti128  m1, xm2, 1
+    vperm2i128   m2, m0, m4, 0x31
+    vinserti128  m0, xm4, 1
+    ret
+
+INV_TXFM_16X4_FN adst, dct
+INV_TXFM_16X4_FN adst, adst
+INV_TXFM_16X4_FN adst, flipadst
+INV_TXFM_16X4_FN adst, identity
+
+cglobal iadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+.pass1:
+    call m(iadst_4x16_internal_10bpc).main
+    psrad        m11, 11 ; pd_1
+    REPX {paddd x, m11}, m0, m1, m2, m3
+    paddd        m4, m5, m11
+    paddd        m5, m6, m11
+    paddd        m6, m7, m11
+    paddd        m7, m8, m11
+.pass1_end:
+    REPX {pshufd x, x, q1032}, m0, m2, m4, m6
+    REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7
+    jmp tx2q
+.pass2:
+    call m(idct_16x4_internal_10bpc).transpose_4x16_packed
+    lea          r6, [deint_shuf+128]
+    call m(iadst_16x4_internal_8bpc).main
+    jmp m(idct_16x4_internal_10bpc).end
+ALIGN function_align
+.main:
+    vpbroadcastd m6, [pd_1321]
+    mova         m0, [cq+32*0]
+    mova         m1, [cq+32*1]
+    vpbroadcastd m7, [pd_2482]
+    mova         m2, [cq+32*6]
+    mova         m3, [cq+32*7]
+    pmulld       m4, m0, m6
+    pmulld       m5, m1, m6 ; 1321*in0
+    pmulld       m9, m2, m7
+    pmulld       m8, m3, m7 ; 2482*in3
+    paddd        m4, m9
+    paddd        m8, m5     ; 1321*in0 + 2482*in3
+    pmulld       m5, m0, m7
+    pmulld       m9, m1, m7 ; 2482*in0
+    paddd        m0, m2
+    paddd        m1, m3     ; in0 + in3
+    paddd        m7, m6     ; pd_3803
+    pmulld       m2, m7
+    pmulld       m3, m7     ; 3803*in3
+    psubd        m5, m2
+    psubd        m9, m3     ; 2482*in0 - 3803*in3
+    mova         m2, [cq+32*4]
+    pmulld       m10, m7, m2
+    pmulld       m3, m6, m2
+    psubd        m2, m0
+    mova         m0, [cq+32*5]
+    pmulld       m7, m0     ; 3803*in2
+    pmulld       m6, m0     ; 1321*in2
+    psubd        m0, m1     ; in2 - in0 - in3
+    vpbroadcastd m1, [pd_m3344]
+    paddd        m4, m10
+    paddd        m7, m8     ; t0
+    psubd        m5, m3
+    psubd        m9, m6     ; t1
+    pmulld       m2, m1
+    pmulld       m0, m1     ; t2
+    pmulld       m3, m1, [cq+32*2]
+    pmulld       m1, [cq+32*3] ; -t3
+    ret
+ALIGN function_align
+.main_end:
+    ; expects: m6 = rnd
+    paddd        m5, m6
+    paddd        m9, m6
+    paddd        m10, m4, m5
+    paddd        m4, m6
+    paddd        m8, m7, m6
+    paddd        m7, m9
+    psubd        m4, m3  ; out0 (unshifted)
+    psubd        m5, m3  ; out1 (unshifted)
+    paddd        m2, m6  ; out2 (unshifted)
+    paddd        m3, m10 ; out3 (unshifted)
+    psubd        m8, m1  ; out4 (unshifted)
+    psubd        m9, m1  ; out5 (unshifted)
+    paddd        m6, m0  ; out6 (unshifted)
+    paddd        m7, m1  ; out7 (unshifted)
+    ret
+
+INV_TXFM_16X4_FN flipadst, dct
+INV_TXFM_16X4_FN flipadst, adst
+INV_TXFM_16X4_FN flipadst, flipadst
+INV_TXFM_16X4_FN flipadst, identity
+
+cglobal iflipadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+.pass1:
+    call m(iadst_4x16_internal_10bpc).main
+    psrad        m11, 11 ; pd_1
+    paddd        m4, m3, m11
+    paddd        m3, m5, m11
+    paddd        m5, m2, m11
+    paddd        m2, m6, m11
+    paddd        m6, m1, m11
+    paddd        m1, m7, m11
+    paddd        m7, m0, m11
+    paddd        m0, m8, m11
+    jmp m(iadst_16x4_internal_10bpc).pass1_end
+.pass2:
+    call m(idct_16x4_internal_10bpc).transpose_4x16_packed
+    lea          r6, [deint_shuf+128]
+    call m(iadst_16x4_internal_8bpc).main
+    vpbroadcastd m4, [pw_2048]
+    pmulhrsw     m5, m3, m4
+    pmulhrsw     m6, m2, m4
+    pmulhrsw     m2, m1, m4
+    pmulhrsw     m3, m0, m4
+    paddw        m0, m5, [dstq+strideq*0]
+    paddw        m1, m6, [dstq+strideq*1]
+    vpbroadcastd m5, [pixel_10bpc_max]
+    jmp m(idct_16x4_internal_10bpc).end3
+
+INV_TXFM_16X4_FN identity, dct
+INV_TXFM_16X4_FN identity, adst
+INV_TXFM_16X4_FN identity, flipadst
+INV_TXFM_16X4_FN identity, identity
+
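+; iidentity_16x4: pd_5793 = round(sqrt(2)*4096), the .12 fixed-point
+; sqrt(2) factor used by the 16-point identity scaling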
+cglobal iidentity_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m8, [pd_5793]
+    vpermq       m0, [cq+32*0], q3120 ; 0 1
+    vpermq       m1, [cq+32*1], q3120 ; 2 3
+    vpermq       m2, [cq+32*2], q3120 ; 4 5
+    vpermq       m3, [cq+32*3], q3120 ; 6 7
+    vpermq       m4, [cq+32*4], q3120 ; 8 9
+    vpermq       m5, [cq+32*5], q3120 ; a b
+    vpermq       m6, [cq+32*6], q3120 ; c d
+    vpermq       m7, [cq+32*7], q3120 ; e f
+    vpbroadcastd m9, [pd_3072]
+    REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7
+    jmp tx2q
+.pass2:
+    call m(idct_16x4_internal_10bpc).transpose_4x16_packed
+    vpbroadcastd m7, [pw_1697x8]
+    pmulhrsw     m4, m7, m0
+    pmulhrsw     m5, m7, m1
+    pmulhrsw     m6, m7, m2
+    pmulhrsw     m7, m3
+    paddsw       m0, m4
+    paddsw       m1, m5
+    paddsw       m2, m6
+    paddsw       m3, m7
+    jmp m(idct_16x4_internal_10bpc).end
+
+INV_TXFM_16X4_FN dct, dct, 12
+INV_TXFM_16X4_FN dct, identity, 12
+INV_TXFM_16X4_FN dct, adst, 12
+INV_TXFM_16X4_FN dct, flipadst, 12
+
+cglobal idct_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m8, [clip_20b_min]
+    vpbroadcastd m9, [clip_20b_max]
+    jmp m(idct_16x4_internal_10bpc).pass1
+.pass2:
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+    REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+    ; deinterleave
+    REPX {pshufd x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7
+    ; transpose
+    punpcklqdq   m8, m0, m1
+    punpckhqdq   m0, m1
+    punpcklqdq   m9, m2, m3
+    punpckhqdq   m2, m3
+    punpcklqdq   m10, m4, m5
+    punpckhqdq   m4, m5
+    punpcklqdq   m11, m6, m7
+    punpckhqdq   m6, m7
+    vperm2i128   m3, m0, m2, 0x31   ; out6
+    vperm2i128   m1, m0, m2, 0x20   ; out2
+    vperm2i128   m7, m4, m6, 0x31   ; out7
+    vperm2i128   m5, m4, m6, 0x20   ; out3
+    vperm2i128   m13, m10, m11, 0x31 ; out5
+    vperm2i128   m12, m10, m11, 0x20 ; out1
+    vperm2i128   m11, m8, m9, 0x31  ; out4
+    vperm2i128   m10, m8, m9, 0x20  ; out0
+    call m(idct_4x16_internal_10bpc).pass1_main
+    pmulld       m0, m6, m10
+    pmulld       m2, m6, m11
+    pmulld       m4, m6, m12
+    pmulld       m6, m13
+    vpbroadcastd m10, [pd_17408]
+    call m(idct_4x16_internal_10bpc).pass1_main2
+    REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7
+    packssdw     m0, m4
+    packssdw     m1, m5
+    packssdw     m2, m6
+    packssdw     m3, m7
+    vpbroadcastd m5, [pixel_12bpc_max]
+    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+    jmp m(idct_16x4_internal_10bpc).end2
+
+INV_TXFM_16X4_FN adst, dct, 12
+INV_TXFM_16X4_FN adst, adst, 12
+INV_TXFM_16X4_FN adst, flipadst, 12
+INV_TXFM_16X4_FN adst, identity, 12
+
+cglobal iadst_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m12, [clip_20b_min]
+    vpbroadcastd m13, [clip_20b_max]
+    jmp m(iadst_16x4_internal_10bpc).pass1
+.pass2:
+    call .pass2_main
+    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+    REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+    jmp m(idct_16x4_internal_10bpc).end2
+ALIGN function_align
+.pass2_main:
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+    REPX {pmaxsd x, m12}, m0, m1, m2, m3, m6, m7
+    pmaxsd       m8, m4, m12
+    pmaxsd       m9, m5, m12
+    REPX {pminsd x, m13}, m0, m1, m2, m3
+    call m(iadst_8x4_internal_12bpc).transpose_4x8
+    mova         [cq+32*0], m0
+    mova         [cq+32*2], m1
+    mova         [cq+32*4], m2
+    mova         [cq+32*6], m3
+    pminsd       m0, m8, m13
+    pminsd       m1, m9, m13
+    pminsd       m2, m6, m13
+    pminsd       m3, m7, m13
+    call m(iadst_8x4_internal_12bpc).transpose_4x8
+    mova         [cq+32*1], m0
+    mova         [cq+32*3], m1
+    mova         [cq+32*5], m2
+    mova         [cq+32*7], m3
+    call m(iadst_16x4_internal_10bpc).main
+    vpbroadcastd m6, [pd_2048]
+    call m(iadst_16x4_internal_10bpc).main_end
+    psrad        m0, m4, 15
+    psrad        m1, m5, 15
+    psrad        m2, 15
+    psrad        m3, 15
+    psrad        m4, m8, 15
+    psrad        m5, m9, 15
+    psrad        m6, 15
+    psrad        m7, 15
+    packssdw     m0, m4
+    packssdw     m1, m5
+    packssdw     m2, m6
+    packssdw     m3, m7
+    vpbroadcastd m4, [pw_16384]
+    vpbroadcastd m5, [pixel_12bpc_max]
+    ret
+
+INV_TXFM_16X4_FN flipadst, dct, 12
+INV_TXFM_16X4_FN flipadst, adst, 12
+INV_TXFM_16X4_FN flipadst, flipadst, 12
+INV_TXFM_16X4_FN flipadst, identity, 12
+
+cglobal iflipadst_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m12, [clip_20b_min]
+    vpbroadcastd m13, [clip_20b_max]
+    jmp m(iflipadst_16x4_internal_10bpc).pass1
+.pass2:
+    call m(iadst_16x4_internal_12bpc).pass2_main
+    vpermq       m7, m0, q3120
+    vpermq       m6, m1, q3120
+    vpermq       m1, m2, q3120
+    vpermq       m0, m3, q3120
+    pmulhrsw     m0, m4
+    pmulhrsw     m1, m4
+    pmulhrsw     m2, m6, m4
+    pmulhrsw     m3, m7, m4
+    jmp m(idct_16x4_internal_10bpc).end2
+
+INV_TXFM_16X4_FN identity, dct, 12
+INV_TXFM_16X4_FN identity, adst, 12
+INV_TXFM_16X4_FN identity, flipadst, 12
+INV_TXFM_16X4_FN identity, identity, 12
+
+cglobal iidentity_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m8, [pd_1697]
+    vpermq       m0, [cq+32*0], q3120 ; 0 1
+    vpermq       m1, [cq+32*1], q3120 ; 2 3
+    vpermq       m2, [cq+32*2], q3120 ; 4 5
+    vpermq       m3, [cq+32*3], q3120 ; 6 7
+    vpbroadcastd m9, [pd_3072]
+    pmulld       m4, m8, m0
+    pmulld       m5, m8, m1
+    pmulld       m6, m8, m2
+    pmulld       m7, m8, m3
+    vpermq       m10, [cq+32*4], q3120 ; 8 9
+    vpermq       m11, [cq+32*5], q3120 ; a b
+    vpermq       m12, [cq+32*6], q3120 ; c d
+    vpermq       m13, [cq+32*7], q3120 ; e f
+    REPX {paddd x, m9}, m4, m5, m6, m7
+    REPX {psrad x, 12}, m4, m5, m6, m7
+    paddd        m0, m4
+    pmulld       m4, m8, m10
+    paddd        m1, m5
+    pmulld       m5, m8, m11
+    paddd        m2, m6
+    pmulld       m6, m8, m12
+    paddd        m3, m7
+    pmulld       m7, m8, m13
+    REPX {paddd x, m9}, m4, m5, m6, m7
+    REPX {psrad x, 12}, m4, m5, m6, m7
+    paddd        m4, m10
+    paddd        m5, m11
+    paddd        m6, m12
+    paddd        m7, m13
+    jmp tx2q
+.pass2:
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+    REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+    vpbroadcastd m8, [pd_5793]
+    vpbroadcastd m9, [pd_2048]
+    REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7
+    call m(idct_16x4_internal_10bpc).transpose_4x16_packed
+    vpbroadcastd m4, [pw_16384]
+    REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+    vpbroadcastd m5, [pixel_12bpc_max]
+    jmp m(idct_16x4_internal_10bpc).end2
+
+%macro INV_TXFM_16X8_FN 2-3 10 ; type1, type2, bitdepth
+    INV_TXFM_FN %1, %2, 0, 16x8, %3
+%ifidn %1_%2, dct_dct
+    imul         r6d, [cq], 181
+    vpbroadcastd m3, [dconly_%3bpc]
+    mov          [cq], eobd ; 0
+    or           r3d, 8
+    add          r6d, 128
+    sar          r6d, 8
+    imul         r6d, 181
+    jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2
+%endif
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, identity
+INV_TXFM_16X8_FN dct, adst
+INV_TXFM_16X8_FN dct, flipadst
+
+cglobal idct_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+.pass1:
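+    ; 16x8 is non-square, so every input is pre-scaled by 2896/4096
+    ; (~1/sqrt(2)) through the _rect2 helper entry points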
[cq+32*10] + pmulld m6, m14, [cq+32*12] + pmulld m7, m14, [cq+32*14] + call m(idct_8x8_internal_10bpc).main_rect2 + call m(idct_8x16_internal_10bpc).main_evenhalf + psrld m11, 11 ; pd_1 + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + call .pass1_rotations + REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call .transpose + call m(idct_16x8_internal_8bpc).main + vpbroadcastd m10, [pw_2048] +.end: + pmulhrsw m0, m10 + pmulhrsw m1, m10 + pmulhrsw m2, m10 + pmulhrsw m3, m10 + call .write_16x4_start +.end2: + pmulhrsw m0, m4, m10 + pmulhrsw m1, m5, m10 + pmulhrsw m2, m6, m10 + pmulhrsw m3, m7, m10 + call .write_16x4_zero + RET +ALIGN function_align +.pass1_rotations: + mova m14, [r6-32*4] + mova m13, [r6-32*3] + mova m12, [r6-32*2] + mova m11, [r6-32*1] + mova m10, [r6+32*0] + mova m9, [r6+32*1] + mova m8, [r6+32*2] + psubd m15, m0, m14 ; out15 + paddd m0, m14 ; out0 + psubd m14, m1, m13 ; out14 + paddd m1, m13 ; out1 + psubd m13, m2, m12 ; out13 + paddd m2, m12 ; out2 + psubd m12, m3, m11 ; out12 + paddd m3, m11 ; out3 + psubd m11, m4, m10 ; out11 + paddd m4, m10 ; out4 + psubd m10, m5, m9 ; out10 + paddd m5, m9 ; out5 + psubd m9, m6, m8 ; out9 + paddd m6, m8 ; out6 + psubd m8, m7, [r6+32*3] ; out8 + paddd m7, [r6+32*3] ; out7 + ret +ALIGN function_align +.transpose: + lea r6, [deint_shuf+128] +.transpose2: + packssdw m0, m8 + packssdw m1, m9 + packssdw m2, m10 + packssdw m3, m11 + packssdw m4, m12 + packssdw m5, m13 + packssdw m6, m14 + packssdw m7, m15 +.transpose3: + punpckhwd m8, m0, m1 + punpcklwd m0, m1 + punpcklwd m1, m2, m3 + punpckhwd m2, m3 + punpckhwd m3, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m6, m7 + punpcklwd m6, m7 + punpckhdq m7, m4, m6 + punpckldq m4, m6 + punpckldq m6, m8, m2 + punpckhdq m8, m2 + punpckhdq m2, m0, m1 + punpckldq m0, m1 + punpckhdq m1, m3, m5 + punpckldq m3, m5 + punpcklqdq m5, m6, m3 + punpckhqdq m6, m3 + punpckhqdq m3, m2, m7 + punpcklqdq m2, m7 + punpcklqdq m7, m8, m1 + punpckhqdq m8, m1 + punpckhqdq m1, m0, m4 + punpcklqdq m0, m4 + vperm2i128 m4, m0, m5, 0x31 + vinserti128 m0, xm5, 1 + vperm2i128 m5, m1, m6, 0x31 + vinserti128 m1, xm6, 1 + vperm2i128 m6, m2, m7, 0x31 + vinserti128 m2, xm7, 1 + vperm2i128 m7, m3, m8, 0x31 + vinserti128 m3, xm8, 1 + ret +ALIGN function_align +.write_16x4_start: + vpbroadcastd m9, [pixel_10bpc_max] + lea r3, [strideq*3] + pxor m8, m8 +.write_16x4_zero: + REPX {mova [cq+32*x], m8}, 0, 1, 2, 3, 4, 5, 6, 7 + add cq, 32*8 +.write_16x4: + paddw m0, [dstq+strideq*0] + paddw m1, [dstq+strideq*1] + paddw m2, [dstq+strideq*2] + paddw m3, [dstq+r3 ] + REPX {pmaxsw x, m8}, m0, m1, m2, m3 + REPX {pminsw x, m9}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r3 ], m3 + lea dstq, [dstq+strideq*4] + ret + +INV_TXFM_16X8_FN adst, dct +INV_TXFM_16X8_FN adst, adst +INV_TXFM_16X8_FN adst, flipadst +INV_TXFM_16X8_FN adst, identity + +cglobal iadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] +.pass1: + lea r6, [rsp+32*4] + call .main + vpbroadcastd m14, [pd_3072] + psrld m15, 11 ; pd_1 + psubd m13, m14, m15 ; pd_3071 + call .pass1_rotations +.pass1_end: + REPX {psrad x, 1 }, m0, m1, m2, m3, m12, m13, m14, m15 + REPX {psrad x, 12}, m4, m5, m6, m7, m8, m9, m10, m11 + jmp tx2q +.pass2: + call m(idct_16x8_internal_10bpc).transpose + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass2_end + 
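+ ; note: the epilogue below rounds with pmulhrsw, i.e. (x*k + 0x4000) >> 15;
+ ; with k = 2048 that is (x + 8) >> 4, the final >>4 of the inverse
+ ; transform. m11 is built as 0 - m10 = -2048 so that the rows the shared
+ ; 8bpc adst kernel leaves negated (-out1, -out3, ...) are sign-corrected
+ ; and rounded in a single multiply.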
vpbroadcastd m10, [pw_2048] + pxor m11, m11 + psubw m11, m10 + pmulhrsw m0, m10 + pmulhrsw m1, m11 + pmulhrsw m2, m10 + pmulhrsw m3, m11 + call m(idct_16x8_internal_10bpc).write_16x4_start + pmulhrsw m0, m4, m10 + pmulhrsw m1, m5, m11 + pmulhrsw m2, m6, m10 + pmulhrsw m3, m7, m11 + call m(idct_16x8_internal_10bpc).write_16x4_zero + RET +ALIGN function_align +.pass1_rotations: + paddd m0, m15 + psubd m1, m15, m1 + paddd m2, m15 + psubd m3, m15, m3 + paddd m4, m14 + psubd m5, m13, m5 + paddd m6, m14 + psubd m7, m13, m7 + paddd m8, m14, m9 + psubd m9, m13, m10 + paddd m10, m14, m11 + psubd m11, m13, m12 + paddd m12, m15, [r6-32*1] + psubd m13, m15, [r6-32*2] + paddd m14, m15, [r6-32*3] + psubd m15, [r6-32*4] + ret +ALIGN function_align +.main: + ; expects: m13 = clip_min m14 = clip_max + vpbroadcastd m15, [pd_2896] + pmulld m0, m15, [cq+32* 2] + pmulld m1, m15, [cq+32*13] + pmulld m2, m15, [cq+32* 6] + pmulld m3, m15, [cq+32* 9] + pmulld m4, m15, [cq+32*10] + pmulld m5, m15, [cq+32* 5] + pmulld m6, m15, [cq+32*14] + pmulld m7, m15, [cq+32* 1] + vpbroadcastd m12, [pd_2048] + REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 + call .main_part1 + pmulld m0, m15, [cq+32* 0] + pmulld m1, m15, [cq+32*15] + pmulld m2, m15, [cq+32* 4] + pmulld m3, m15, [cq+32*11] + pmulld m4, m15, [cq+32* 8] + pmulld m5, m15, [cq+32* 7] + pmulld m6, m15, [cq+32*12] + pmulld m7, m15, [cq+32* 3] + REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 +.main_part2: + ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 201, 4091 + ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 1751, 3703 + ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3035, 2751 + ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 3857, 1380 + psubd m8, m0, m4 ; t8a + paddd m0, m4 ; t0a + psubd m4, m1, m5 ; t9a + paddd m1, m5 ; t1a + psubd m5, m2, m6 ; t12a + paddd m2, m6 ; t4a + psubd m6, m3, m7 ; t13a + paddd m7, m3 ; t5a + REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7 + REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7 + vpbroadcastd m11, [pd_4017] + vpbroadcastd m10, [pd_799] + ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11 + ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10 + psubd m3, m0, m2 ; t4 + paddd m0, m2 ; t0 + psubd m2, m1, m7 ; t5 + paddd m1, m7 ; t1 + psubd m7, m4, m6 ; t12a + paddd m4, m6 ; t8a + psubd m6, m8, m5 ; t13a + paddd m5, m8 ; t9a + REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5 + REPX {pminsd x, m14}, m3, m2, m7, m6, m0, m1, m4, m5 + vpbroadcastd m11, [pd_3784] + vpbroadcastd m10, [pd_1567] + ITX_MULSUB_2D 3, 2, 8, 9, _, 12, 10, 11 + ITX_MULSUB_2D 7, 6, 8, 9, _, 12, 10, 11 + pminsd m10, m14, [r6-32*4] ; t2 + pminsd m8, m14, [r6-32*3] ; t3 + psubd m9, m0, m10 ; t2a + paddd m0, m10 ; out0 + psubd m10, m1, m8 ; t3a + paddd m1, m8 ; -out15 + pmaxsd m9, m13 + pmaxsd m10, m13 + pminsd m9, m14 + pminsd m10, m14 + mova [r6-32*4], m1 + mova m11, [r6-32*1] ; t7a + mova m1, [r6-32*2] ; t6a + psubd m8, m3, m11 ; t7 + paddd m11, m3 ; out12 + paddd m3, m2, m1 ; -out3 + psubd m2, m1 ; t6 + pmaxsd m8, m13 + pmaxsd m2, m13 + pminsd m8, m14 + pminsd m2, m14 + mova [r6-32*1], m11 + mova [r6-32*3], m2 + mova m1, [r6+32*3] ; t15 + mova m2, [r6+32*2] ; t14 + paddd m12, m7, m1 ; -out13 + psubd m7, m1 ; t15a + psubd m11, m6, m2 ; t14a + paddd m2, m6 ; out2 + pmaxsd m7, m13 + pmaxsd m11, m13 + pminsd m7, m14 + pminsd m11, m14 + mova [r6-32*2], m12 + pminsd m1, m14, [r6+32*0] ; t10a + pminsd m12, m14, [r6+32*1] ; t11a + psubd m6, m4, m1 ; t10 + paddd m1, m4 ; -out1 + psubd m4, m5, m12 ; t11 + paddd m5, m12 
; out14 + vpbroadcastd m12, [pd_1448] + pmaxsd m6, m13 + pmaxsd m4, m13 + pminsd m6, m14 + pminsd m4, m14 + REPX {pmulld x, m12}, m9, m10, m8, m7, m11, m6, m4 + pmulld m12, [r6-32*3] ; t6 + mova [r6-32*3], m5 + paddd m5, m11, m7 ; -out5 (unshifted) + psubd m11, m7 ; out10 (unshifted) + paddd m7, m9, m10 ; -out7 (unshifted) + psubd m9, m10 ; out8 (unshifted) + psubd m10, m6, m4 ; -out9 (unshifted) + paddd m6, m4 ; out6 (unshifted) + paddd m4, m12, m8 ; out4 (unshifted) + psubd m12, m8 ; -out11 (unshifted) + ret +.main_part1: + ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 995, 3973 + ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 2440, 3290 + ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3513, 2106 + ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 4052, 601 + psubd m8, m0, m4 ; t10a + paddd m0, m4 ; t2a + psubd m4, m1, m5 ; t11a + paddd m1, m5 ; t3a + psubd m5, m2, m6 ; t14a + paddd m2, m6 ; t6a + psubd m6, m3, m7 ; t15a + paddd m7, m3 ; t7a + REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7 + REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7 + vpbroadcastd m11, [pd_2276] + vpbroadcastd m10, [pd_3406] + ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11 + ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10 + psubd m3, m0, m2 ; t6 + paddd m0, m2 ; t2 + psubd m2, m1, m7 ; t7 + paddd m1, m7 ; t3 + psubd m7, m4, m6 ; t14a + paddd m4, m6 ; t10a + psubd m6, m8, m5 ; t15a + paddd m5, m8 ; t11a + REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5 + REPX {pminsd x, m14}, m3, m2, m7, m6 ; clip the rest later + vpbroadcastd m11, [pd_1567] + vpbroadcastd m10, [pd_3784] + ITX_MULSUB_2D 2, 3, 8, 9, _, 12, 10, 11 + ITX_MULSUB_2D 6, 7, 8, 9, _, 12, 10, 11 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6-32*2], m2 + mova [r6-32*1], m3 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + ret + +INV_TXFM_16X8_FN flipadst, dct +INV_TXFM_16X8_FN flipadst, adst +INV_TXFM_16X8_FN flipadst, flipadst +INV_TXFM_16X8_FN flipadst, identity + +cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] +.pass1: + lea r6, [rsp+32*4] + call m(iadst_16x8_internal_10bpc).main + vpbroadcastd m14, [pd_3072] + psrld m15, 11 + psubd m13, m14, m15 + call .pass1_rotations + jmp m(iadst_16x8_internal_10bpc).pass1_end +.pass2: + call m(idct_16x8_internal_10bpc).transpose + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass2_end + vpbroadcastd m10, [pw_2048] + pxor m11, m11 + psubw m11, m10 + mova m12, m0 + pmulhrsw m0, m7, m11 + mova m7, m1 + pmulhrsw m1, m6, m10 + mova m6, m2 + pmulhrsw m2, m5, m11 + mova m5, m3 + pmulhrsw m3, m4, m10 + call m(idct_16x8_internal_10bpc).write_16x4_start + pmulhrsw m0, m5, m11 + pmulhrsw m1, m6, m10 + pmulhrsw m2, m7, m11 + pmulhrsw m3, m12, m10 + call m(idct_16x8_internal_10bpc).write_16x4_zero + RET +ALIGN function_align +.pass1_rotations: + psubd m8, m13, m7 + paddd m7, m14, m9 + paddd m9, m14, m6 + psubd m6, m13, m10 + psubd m10, m13, m5 + paddd m5, m14, m11 + paddd m11, m14, m4 + psubd m4, m13, m12 + psubd m12, m15, m3 + paddd m3, m15, [r6-32*1] + paddd m13, m15, m2 + psubd m2, m15, [r6-32*2] + psubd m14, m15, m1 + mova m1, m15 + paddd m15, m0 + psubd m0, m1, [r6-32*4] + paddd m1, [r6-32*3] + ret + +INV_TXFM_16X8_FN identity, dct +INV_TXFM_16X8_FN identity, adst +INV_TXFM_16X8_FN identity, flipadst +INV_TXFM_16X8_FN identity, identity + +cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 +.pass1: + vpbroadcastd m15, [pd_2896] + pmulld m0, m15, [cq+32* 0] + pmulld m1, m15, 
[cq+32* 1] + pmulld m2, m15, [cq+32* 2] + pmulld m3, m15, [cq+32* 3] + pmulld m4, m15, [cq+32* 4] + pmulld m5, m15, [cq+32* 5] + pmulld m6, m15, [cq+32* 6] + pmulld m7, m15, [cq+32* 7] + pmulld m8, m15, [cq+32* 8] + pmulld m9, m15, [cq+32* 9] + pmulld m10, m15, [cq+32*10] + pmulld m11, m15, [cq+32*11] + pmulld m12, m15, [cq+32*12] + pmulld m13, m15, [cq+32*13] + pmulld m14, m15, [cq+32*14] + pmulld m15, [cq+32*15] + mova [rsp], m7 + vpbroadcastd m7, [pd_2048] + REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + paddd m7, [rsp] + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + mova [rsp], m15 + vpbroadcastd m15, [pd_5793] + REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14 + pmulld m15, [rsp] + mova [rsp], m7 + vpbroadcastd m7, [pd_3072] + REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + paddd m7, [rsp] + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call m(idct_16x8_internal_10bpc).transpose + vpbroadcastd m10, [pw_4096] + jmp m(idct_16x8_internal_10bpc).end + +INV_TXFM_16X8_FN dct, dct, 12 +INV_TXFM_16X8_FN dct, identity, 12 +INV_TXFM_16X8_FN dct, adst, 12 +INV_TXFM_16X8_FN dct, flipadst, 12 + +cglobal idct_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(idct_16x8_internal_10bpc).pass1 +.pass2: + call .pass2_main + RET +ALIGN function_align +.pass2_main: + call m(idct_8x16_internal_12bpc).transpose + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vpbroadcastd m11, [pd_2048] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct_8x8_internal_10bpc).main + call m(idct_8x8_internal_12bpc).round_shift4 + mova [cq+32* 8], m0 + mova [cq+32* 9], m1 + mova [cq+32*10], m2 + mova [cq+32*11], m3 + mova [cq+32*12], m4 + mova [cq+32*13], m5 + mova [cq+32*14], m6 + mova [cq+32*15], m7 + pmaxsd m0, m12, [cq+32*0] + pmaxsd m1, m12, [cq+32*1] + pmaxsd m2, m12, [cq+32*2] + pmaxsd m3, m12, [cq+32*3] + pmaxsd m4, m12, [cq+32*4] + pmaxsd m5, m12, [cq+32*5] + pmaxsd m6, m12, [cq+32*6] + pmaxsd m7, m12, [cq+32*7] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct_8x8_internal_10bpc).main + call m(idct_8x8_internal_12bpc).round_shift4 +.end: + packssdw m0, [cq+32* 8] + packssdw m1, [cq+32* 9] + packssdw m2, [cq+32*10] + packssdw m3, [cq+32*11] + packssdw m4, [cq+32*12] + packssdw m5, [cq+32*13] + packssdw m6, [cq+32*14] + packssdw m7, [cq+32*15] + REPX {vpermq x, x, q3120}, m0, m1, m2, m3 + call .write_16x4_start + call m(idct_16x8_internal_10bpc).write_16x4_zero + vpermq m0, m4, q3120 + vpermq m1, m5, q3120 + vpermq m2, m6, q3120 + vpermq m3, m7, q3120 + jmp m(idct_16x8_internal_10bpc).write_16x4_zero +ALIGN function_align +.write_16x4_start: + vpbroadcastd m9, [pixel_12bpc_max] + lea r3, [strideq*3] + pxor m8, m8 + ret + +INV_TXFM_16X8_FN adst, dct, 12 +INV_TXFM_16X8_FN adst, adst, 12 +INV_TXFM_16X8_FN adst, flipadst, 12 +INV_TXFM_16X8_FN adst, identity, 12 + +cglobal iadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_20b_min] + vpbroadcastd m14, [clip_20b_max] + jmp m(iadst_16x8_internal_10bpc).pass1 +.pass2: + call .pass2_main + call m(idct_16x8_internal_12bpc).end + RET +ALIGN function_align +.pass2_main: + call 
m(idct_8x16_internal_12bpc).transpose + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vpbroadcastd m11, [pd_2048] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(iadst_8x8_internal_12bpc).pass2_main2 + mova [cq+32* 8], m0 + mova [cq+32* 9], m1 + mova [cq+32*10], m2 + mova [cq+32*11], m3 + mova [cq+32*12], m4 + mova [cq+32*13], m5 + mova [cq+32*14], m6 + mova [cq+32*15], m7 + pmaxsd m0, m12, [cq+32*0] + pmaxsd m1, m12, [cq+32*1] + pmaxsd m2, m12, [cq+32*2] + pmaxsd m3, m12, [cq+32*3] + pmaxsd m4, m12, [cq+32*4] + pmaxsd m5, m12, [cq+32*5] + pmaxsd m6, m12, [cq+32*6] + pmaxsd m7, m12, [cq+32*7] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(iadst_8x8_internal_12bpc).pass2_main2 + ret + +INV_TXFM_16X8_FN flipadst, dct, 12 +INV_TXFM_16X8_FN flipadst, adst, 12 +INV_TXFM_16X8_FN flipadst, flipadst, 12 +INV_TXFM_16X8_FN flipadst, identity, 12 + +cglobal iflipadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_20b_min] + vpbroadcastd m14, [clip_20b_max] + jmp m(iflipadst_16x8_internal_10bpc).pass1 +.pass2: + call m(iadst_16x8_internal_12bpc).pass2_main + packssdw m13, m0, [cq+32* 8] + packssdw m12, m1, [cq+32* 9] + packssdw m11, m2, [cq+32*10] + packssdw m10, m3, [cq+32*11] + packssdw m3, m4, [cq+32*12] + packssdw m2, m5, [cq+32*13] + packssdw m1, m6, [cq+32*14] + packssdw m0, m7, [cq+32*15] + REPX {vpermq x, x, q3120}, m0, m1, m2, m3 + call m(idct_16x8_internal_12bpc).write_16x4_start + call m(idct_16x8_internal_10bpc).write_16x4_zero + vpermq m0, m10, q3120 + vpermq m1, m11, q3120 + vpermq m2, m12, q3120 + vpermq m3, m13, q3120 + call m(idct_16x8_internal_10bpc).write_16x4_zero + RET + +INV_TXFM_16X8_FN identity, dct, 12 +INV_TXFM_16X8_FN identity, adst, 12 +INV_TXFM_16X8_FN identity, flipadst, 12 +INV_TXFM_16X8_FN identity, identity, 12 + +cglobal iidentity_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + jmp m(iidentity_16x8_internal_10bpc).pass1 +.pass2: + call m(idct_16x8_internal_10bpc).transpose2 + vpbroadcastd m10, [pw_4096] + pmulhrsw m0, m10 + pmulhrsw m1, m10 + pmulhrsw m2, m10 + pmulhrsw m3, m10 + call m(idct_16x8_internal_12bpc).write_16x4_start + call m(idct_16x8_internal_10bpc).write_16x4_zero + jmp m(idct_16x8_internal_10bpc).end2 + +%macro INV_TXFM_16X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth + INV_TXFM_FN %1, %2, %3, 16x16, %4 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_%4bpc] + mov [cq], eobd ; 0 + or r3d, 16 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3 +%endif +%endmacro + +INV_TXFM_16X16_FN dct, dct +INV_TXFM_16X16_FN dct, identity, 28 +INV_TXFM_16X16_FN dct, adst +INV_TXFM_16X16_FN dct, flipadst + +cglobal idct_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: + vpbroadcastd m11, [pd_2048] + vpbroadcastd m14, [pd_2896] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + add cq, 32 + call .main + sub cq, 32 + mova m10, [r6-32*4] + mova m9, [r6-32*3] + mova m8, [r6-32*2] + psubd m15, m0, m10 ; out15 + paddd m0, m10 ; out0 + psubd m10, m1, m9 ; out14 + paddd m1, m9 ; out1 + psubd m9, m2, m8 ; out13 + paddd m2, m8 ; out2 + REPX {psrad x, 2}, m0, m1, m2 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6-32*2], m2 + mova m2, [r6-32*1] + mova m1, [r6+32*0] + mova m0, [r6+32*1] + REPX {psrad x, 2}, m9, m10, m15 + psubd m8, m3, m2 ; out12 + paddd m3, m2 ; 
out3 + psubd m2, m4, m1 ; out11 + paddd m4, m1 ; out4 + psubd m1, m5, m0 ; out10 + paddd m5, m0 ; out5 + REPX {psrad x, 2}, m3, m4, m5 + mova [r6-32*1], m3 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova m4, [r6+32*2] + mova m3, [r6+32*3] + REPX {psrad x, 2}, m1, m2, m8 + psubd m5, m6, m4 ; out9 + paddd m6, m4 ; out6 + psubd m4, m7, m3 ; out8 + paddd m7, m3 ; out7 + REPX {psrad x, 2}, m6, m7, m4, m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + add r6, 32*8 + mova [r6-32*4], m4 + mova [r6-32*3], m5 + mova [r6-32*2], m1 + mova [r6-32*1], m2 + mova [r6+32*0], m8 + mova [r6+32*1], m9 + mova [r6+32*2], m10 + mova [r6+32*3], m15 +.fast: + add r6, 32*8 + call .main + mova m14, [r6-32*4] + mova m13, [r6-32*3] + mova m12, [r6-32*2] + mova m11, [r6-32*1] + mova m10, [r6+32*0] + mova m9, [r6+32*1] + mova m8, [r6+32*2] + psubd m15, m0, m14 ; out15 + paddd m0, m14 ; out0 + psubd m14, m1, m13 ; out14 + paddd m1, m13 ; out1 + psubd m13, m2, m12 ; out13 + paddd m2, m12 ; out2 + psubd m12, m3, m11 ; out12 + paddd m3, m11 ; out3 + psubd m11, m4, m10 ; out11 + paddd m4, m10 ; out4 + psubd m10, m5, m9 ; out10 + paddd m5, m9 ; out5 + psubd m9, m6, m8 ; out9 + paddd m6, m8 ; out6 + psubd m8, m7, [r6+32*3] ; out8 + paddd m7, [r6+32*3] ; out7 + sub r6, 32*8 + REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call .transpose + lea r6, [pw_5+128] + mova [rsp], m15 + call m(idct_16x16_internal_8bpc).main + mova m1, [rsp+32*1] +.end: + call .write_16x16 + RET +ALIGN function_align +.write_16x16: + mova [rsp+gprsize+32*0], m8 + mova [rsp+gprsize+32*1], m9 + mova [rsp+gprsize+32*2], m12 + vpbroadcastd m12, [pw_2048] + pmulhrsw m0, m12 + pmulhrsw m1, m12 + pmulhrsw m2, m12 + pmulhrsw m3, m12 + call m(idct_16x8_internal_10bpc).write_16x4_start +.write_16x16_2: + pmulhrsw m0, m12, m4 + pmulhrsw m1, m12, m5 + pmulhrsw m2, m12, m6 + pmulhrsw m3, m12, m7 + call m(idct_16x8_internal_10bpc).write_16x4_zero + pmulhrsw m0, m12, [rsp+gprsize+32*0] + pmulhrsw m1, m12, [rsp+gprsize+32*1] + pmulhrsw m2, m12, m10 + pmulhrsw m3, m12, m11 + call m(idct_16x8_internal_10bpc).write_16x4_zero + pmulhrsw m0, m12, [rsp+gprsize+32*2] + pmulhrsw m1, m12, m13 + pmulhrsw m2, m12, m14 + pmulhrsw m3, m12, m15 + jmp m(idct_16x8_internal_10bpc).write_16x4_zero +ALIGN function_align +.transpose: + test eobd, eobd + jl .transpose_fast + packssdw m8, [r6-32*4] + packssdw m9, [r6-32*3] + packssdw m10, [r6-32*2] + packssdw m11, [r6-32*1] + packssdw m12, [r6+32*0] + packssdw m13, [r6+32*1] + packssdw m14, [r6+32*2] + packssdw m15, [r6+32*3] + sub r6, 32*8 + packssdw m0, [r6-32*4] + packssdw m1, [r6-32*3] + packssdw m2, [r6-32*2] + packssdw m3, [r6-32*1] + packssdw m4, [r6+32*0] + packssdw m5, [r6+32*1] + packssdw m6, [r6+32*2] + packssdw m7, [r6+32*3] + mova [r6], m8 + punpckhwd m8, m0, m1 + punpcklwd m0, m1 + punpcklwd m1, m2, m3 + punpckhwd m2, m3 + punpckhwd m3, m6, m7 + punpcklwd m6, m7 + punpcklwd m7, m4, m5 + punpckhwd m4, m5 + punpckldq m5, m8, m2 + punpckhdq m8, m2 + punpckhdq m2, m0, m1 + punpckldq m0, m1 + punpckhdq m1, m7, m6 + punpckldq m7, m6 + punpckhdq m6, m4, m3 + punpckldq m4, m3 + punpckhqdq m3, m2, m1 + punpcklqdq m2, m1 + punpckhqdq m1, m0, m7 + punpcklqdq m0, m7 + punpcklqdq m7, m8, m6 + punpckhqdq m8, m6 + punpckhqdq m6, m5, m4 + punpcklqdq m5, m4 + mova m4, [r6] + mova [r6], m8 + punpcklwd m8, m4, m9 + punpckhwd m4, m9 + punpcklwd m9, m10, m11 + punpckhwd m10, m11 + punpckhwd m11, m14, m15 + punpcklwd m14, m15 + punpckhwd m15, m12, m13 + punpcklwd m12, m13 + punpckldq 
m13, m4, m10 + punpckhdq m4, m10 + punpckhdq m10, m8, m9 + punpckldq m8, m9 + punpckhdq m9, m12, m14 + punpckldq m12, m14 + punpckhdq m14, m15, m11 + punpckldq m15, m11 + punpckhqdq m11, m10, m9 + punpcklqdq m10, m9 + punpckhqdq m9, m8, m12 + punpcklqdq m8, m12 + punpcklqdq m12, m13, m15 + punpckhqdq m13, m15 + punpckhqdq m15, m4, m14 + punpcklqdq m14, m4, m14 + vperm2i128 m4, m0, m8, 0x31 + vinserti128 m0, xm8, 1 + vinserti128 m8, m5, xm12, 1 + vperm2i128 m12, m5, 0x13 + vperm2i128 m5, m1, m9, 0x31 + vinserti128 m1, xm9, 1 + vinserti128 m9, m6, xm13, 1 + vperm2i128 m13, m6, 0x13 + vperm2i128 m6, m2, m10, 0x31 + vinserti128 m2, xm10, 1 + vinserti128 m10, m7, xm14, 1 + vperm2i128 m14, m7, 0x13 + vperm2i128 m7, m3, m11, 0x31 + vinserti128 m3, xm11, 1 + mova xm11, [r6] + vinserti128 m11, xm15, 1 + vinserti128 m15, [r6+16], 0 + ret +.transpose_fast: + call m(idct_16x8_internal_10bpc).transpose2 + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + ret +ALIGN function_align +.main: + mova m0, [cq+64* 1] + mova m1, [cq+64* 3] + mova m2, [cq+64* 5] + mova m3, [cq+64* 7] + mova m4, [cq+64* 9] + mova m5, [cq+64*11] + mova m6, [cq+64*13] + mova m7, [cq+64*15] + call m(idct_8x16_internal_10bpc).main_oddhalf + mova m0, [cq+64* 0] + mova m1, [cq+64* 2] + mova m2, [cq+64* 4] + mova m3, [cq+64* 6] + mova m4, [cq+64* 8] + mova m5, [cq+64*10] + mova m6, [cq+64*12] + mova m7, [cq+64*14] + call m(idct_8x8_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_evenhalf + psrld m10, m11, 10 ; pd_2 + REPX {paddd x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 + ret + +INV_TXFM_16X16_FN adst, dct +INV_TXFM_16X16_FN adst, adst +INV_TXFM_16X16_FN adst, flipadst + +cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] +.pass1: + vpbroadcastd m15, [pd_2896] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + add cq, 32 + call .main + sub cq, 32 + vpbroadcastd m8, [pd_5120] + paddd m4, m8 + paddd m6, m8 + paddd m9, m8 + paddd m11, m8 + vpbroadcastd m8, [pd_5119] + psubd m5, m8, m5 + psubd m7, m8, m7 + psubd m10, m8, m10 + psubd m12, m8, m12 + REPX {psrad x, 13}, m4, m5, m6, m7, m9, m10, m11, m12 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + psrld m4, m15, 10 ; pd_2 + paddd m0, m4 + psubd m1, m4, m1 + paddd m2, m4 + psubd m3, m4, m3 + psubd m7, m4, [r6-32*4] + paddd m6, m4, [r6-32*3] + psubd m5, m4, [r6-32*2] + paddd m4, [r6-32*1] + REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6-32*2], m2 + mova [r6-32*1], m3 + add r6, 32*8 + mova [r6-32*4], m9 + mova [r6-32*3], m10 + mova [r6-32*2], m11 + mova [r6-32*1], m12 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 +.fast: + add r6, 32*8 + call .main + vpbroadcastd m14, [pd_5120] + vpbroadcastd m13, [pd_5119] + psrld m15, 10 ; pd_2 + paddd m0, m15 + psubd m1, m15, m1 + paddd m2, m15 + psubd m3, m15, m3 + paddd m4, m14 + psubd m5, m13, m5 + paddd m6, m14 + psubd m7, m13, m7 + paddd m8, m14, m9 + psubd m9, m13, m10 + paddd m10, m14, m11 + psubd m11, m13, m12 + paddd m12, m15, [r6-32*1] + psubd m13, m15, [r6-32*2] + paddd m14, m15, [r6-32*3] + psubd m15, [r6-32*4] +.pass1_end: + REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15 + REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11 + sub r6, 32*8 + jmp tx2q +.pass2: + call m(idct_16x16_internal_10bpc).transpose + lea r6, [pw_5+128] + mova [rsp], m15 + call m(iadst_16x16_internal_8bpc).main + 
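+ ; note: by this point the dword pass-1 results have been rounded and packed
+ ; to int16 (packssdw in the transpose), so the second pass can reuse the
+ ; 16-bit 8bpc AVX2 kernels as-is; only the final rounding, pixel clipping
+ ; and stores (write_16x4*) differ for high bit depth.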
call m(iadst_16x16_internal_8bpc).main_pass2_end + mova [rsp+32*0], m8 + mova [rsp+32*2], m12 + mova [rsp+32*3], m13 + vpbroadcastd m12, [pw_2048] + pxor m13, m13 + psubw m13, m12 + pmulhrsw m0, m12 + pmulhrsw m1, m13, [rsp+32*1] + mova [rsp+32*1], m9 + pmulhrsw m2, m12 + pmulhrsw m3, m13 + call m(idct_16x8_internal_10bpc).write_16x4_start + pmulhrsw m0, m12, m4 + pmulhrsw m1, m13, m5 + pmulhrsw m2, m12, m6 + pmulhrsw m3, m13, m7 + call m(idct_16x8_internal_10bpc).write_16x4_zero + pmulhrsw m0, m12, [rsp+32*0] + pmulhrsw m1, m13, [rsp+32*1] + pmulhrsw m2, m12, m10 + pmulhrsw m3, m13, m11 + call m(idct_16x8_internal_10bpc).write_16x4_zero + pmulhrsw m0, m12, [rsp+32*2] + pmulhrsw m1, m13, [rsp+32*3] + pmulhrsw m2, m12, m14 + pmulhrsw m3, m13, m15 + call m(idct_16x8_internal_10bpc).write_16x4_zero + RET +ALIGN function_align +.main: + mova m0, [cq+64* 2] + mova m1, [cq+64*13] + mova m2, [cq+64* 6] + mova m3, [cq+64* 9] + mova m4, [cq+64*10] + mova m5, [cq+64* 5] + mova m6, [cq+64*14] + mova m7, [cq+64* 1] + vpbroadcastd m12, [pd_2048] + call m(iadst_16x8_internal_10bpc).main_part1 + mova m0, [cq+64* 0] + mova m1, [cq+64*15] + mova m2, [cq+64* 4] + mova m3, [cq+64*11] + mova m4, [cq+64* 8] + mova m5, [cq+64* 7] + mova m6, [cq+64*12] + mova m7, [cq+64* 3] + jmp m(iadst_16x8_internal_10bpc).main_part2 + +INV_TXFM_16X16_FN flipadst, dct +INV_TXFM_16X16_FN flipadst, adst +INV_TXFM_16X16_FN flipadst, flipadst + +cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] +.pass1: + vpbroadcastd m15, [pd_2896] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + add cq, 32 + call m(iadst_16x16_internal_10bpc).main + sub cq, 32 + vpbroadcastd m8, [pd_5120] + paddd m11, m8 + paddd m9, m8 + paddd m6, m8 + paddd m4, m8 + vpbroadcastd m8, [pd_5119] + psubd m12, m8, m12 + psubd m10, m8, m10 + psubd m7, m8, m7 + psubd m5, m8, m5 + REPX {psrad x, 13}, m12, m11, m10, m9, m7, m6, m5, m4 + mova [r6+32*0], m12 + mova [r6+32*1], m11 + mova [r6+32*2], m10 + mova [r6+32*3], m9 + psrld m9, m15, 10 ; pd_2 + psubd m3, m9, m3 + paddd m2, m9 + psubd m1, m9, m1 + paddd m0, m9 + psubd m12, m9, [r6-32*4] + paddd m11, m9, [r6-32*3] + psubd m10, m9, [r6-32*2] + paddd m9, [r6-32*1] + REPX {psrad x, 2 }, m12, m11, m10, m9, m3, m2, m1, m0 + mova [r6-32*4], m12 + mova [r6-32*3], m11 + mova [r6-32*2], m10 + mova [r6-32*1], m9 + add r6, 32*8 + mova [r6-32*4], m7 + mova [r6-32*3], m6 + mova [r6-32*2], m5 + mova [r6-32*1], m4 + mova [r6+32*0], m3 + mova [r6+32*1], m2 + mova [r6+32*2], m1 + mova [r6+32*3], m0 +.fast: + add r6, 32*8 + call m(iadst_16x16_internal_10bpc).main + vpbroadcastd m14, [pd_5120] + vpbroadcastd m13, [pd_5119] + psrld m15, 10 ; pd_2 + psubd m8, m13, m7 + paddd m7, m14, m9 + paddd m9, m14, m6 + psubd m6, m13, m10 + psubd m10, m13, m5 + paddd m5, m14, m11 + paddd m11, m14, m4 + psubd m4, m13, m12 + psubd m12, m15, m3 + paddd m3, m15, [r6-32*1] + paddd m13, m15, m2 + psubd m2, m15, [r6-32*2] + psubd m14, m15, m1 + mova m1, m15 + paddd m15, m0 + psubd m0, m1, [r6-32*4] + paddd m1, [r6-32*3] + jmp m(iadst_16x16_internal_10bpc).pass1_end +.pass2: + call m(idct_16x16_internal_10bpc).transpose + lea r6, [pw_5+128] + mova [rsp], m15 + call m(iadst_16x16_internal_8bpc).main + call m(iadst_16x16_internal_8bpc).main_pass2_end + mova [rsp+32*3], m3 + mova [rsp+32*2], m2 + mova [rsp+32*0], m0 + mova m2, m13 + mova m3, m12 + vpbroadcastd m12, [pw_2048] + pxor m13, m13 + psubw m13, m12 + pmulhrsw m0, m13, m15 + pmulhrsw m1, m12, m14 + 
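+ ; note: flipadst is adst with the output rows reversed, so the stores below
+ ; consume m15..m0 back to front while keeping the alternating -2048 (m13)
+ ; and +2048 (m12) pmulhrsw factors that fold the sign fix into the rounding.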
pmulhrsw m2, m13 + pmulhrsw m3, m12 + mova m14, m8 + mova m15, m9 + call m(idct_16x8_internal_10bpc).write_16x4_start + pmulhrsw m0, m13, m11 + pmulhrsw m1, m12, m10 + pmulhrsw m2, m13, m15 + pmulhrsw m3, m12, m14 + call m(idct_16x8_internal_10bpc).write_16x4_zero + pmulhrsw m0, m13, m7 + pmulhrsw m1, m12, m6 + pmulhrsw m2, m13, m5 + pmulhrsw m3, m12, m4 + call m(idct_16x8_internal_10bpc).write_16x4_zero + pmulhrsw m0, m13, [rsp+32*3] + pmulhrsw m1, m12, [rsp+32*2] + pmulhrsw m2, m13, [rsp+32*1] + pmulhrsw m3, m12, [rsp+32*0] + call m(idct_16x8_internal_10bpc).write_16x4_zero + RET + +INV_TXFM_16X16_FN identity, dct, -92 +INV_TXFM_16X16_FN identity, identity + +cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m15, [pd_5793] + vpbroadcastd m7, [pd_5120] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + mov r3, -32*8*4 +.righthalf: + pmulld m0, m15, [cq+r3+32*33] + pmulld m1, m15, [cq+r3+32*35] + pmulld m2, m15, [cq+r3+32*37] + pmulld m3, m15, [cq+r3+32*39] + add r6, 32*4 + REPX {paddd x, m7}, m0, m1, m2, m3 + REPX {psrad x, 13}, m0, m1, m2, m3 + mova [r6+32*0], m0 + mova [r6+32*1], m1 + mova [r6+32*2], m2 + mova [r6+32*3], m3 + add r3, 32*8 + jl .righthalf +.fast: + pmulld m0, m15, [cq+64* 0] + pmulld m1, m15, [cq+64* 1] + pmulld m2, m15, [cq+64* 2] + pmulld m3, m15, [cq+64* 3] + pmulld m4, m15, [cq+64* 4] + pmulld m5, m15, [cq+64* 5] + pmulld m6, m15, [cq+64* 6] + pmulld m8, m15, [cq+64* 7] + mova [cq], m8 + pmulld m8, m15, [cq+64* 8] + pmulld m9, m15, [cq+64* 9] + pmulld m10, m15, [cq+64*10] + pmulld m11, m15, [cq+64*11] + pmulld m12, m15, [cq+64*12] + pmulld m13, m15, [cq+64*13] + pmulld m14, m15, [cq+64*14] + pmulld m15, [cq+64*15] + REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + paddd m7, [cq] + REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call m(idct_16x16_internal_10bpc).transpose + + mova [cq+32*0], m15 + mova [cq+32*1], m0 + vpbroadcastd m15, [pw_1697x16] + + REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \ + 8, 9, 10, 11, 12, 13, 14 + mova m0, [cq+32*1] + mova [cq+32*1], m1 + IDTX16 0, 1, 15 + mova m1, [cq+32*0] + pmulhrsw m15, m1 + paddsw m1, m1 + paddsw m15, m1 + mova m1, [cq+32*1] + jmp m(idct_16x16_internal_10bpc).end + +INV_TXFM_16X16_FN dct, dct, 0, 12 +INV_TXFM_16X16_FN dct, identity, 28, 12 +INV_TXFM_16X16_FN dct, adst, 0, 12 +INV_TXFM_16X16_FN dct, flipadst, 0, 12 + +cglobal idct_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(idct_16x16_internal_10bpc).pass1 +.pass2: + mova [cq+32* 8], m8 + mova [cq+32* 9], m9 + mova [cq+32*10], m10 + mova [cq+32*11], m11 + mova [cq+32*12], m12 + mova [cq+32*13], m13 + mova [cq+32*14], m14 + mova [cq+32*15], m15 + call .pass2_main + packssdw m0, m1 + packssdw m1, m2, m3 + packssdw m2, m4, m5 + packssdw m3, m6, m7 + packssdw m4, m8, m9 + packssdw m5, m10, m11 + packssdw m6, m12, m13 + packssdw m7, m14, m15 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6-32*2], m2 + mova [r6-32*1], m3 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + mova m0, [cq+32* 8] + mova m1, [cq+32* 9] + mova m2, [cq+32*10] + mova m3, [cq+32*11] + mova m4, [cq+32*12] + mova m5, [cq+32*13] + mova m6, [cq+32*14] + mova m7, [cq+32*15] + mov r5, r6 + add r6, 32*16 + call .pass2_main + jmp m(iadst_16x16_internal_12bpc).end +ALIGN function_align +.write_16x16: + mova 
[rsp+gprsize+32*0], m8 + mova [rsp+gprsize+32*1], m9 + mova [rsp+gprsize+32*2], m12 + vpbroadcastd m12, [pw_16384] + pmulhrsw m0, m12 + pmulhrsw m1, m12 + pmulhrsw m2, m12 + pmulhrsw m3, m12 + call m(idct_16x8_internal_12bpc).write_16x4_start + call m(idct_16x8_internal_10bpc).write_16x4_zero + jmp m(idct_16x16_internal_10bpc).write_16x16_2 +ALIGN function_align +.pass2_main: + call m(idct_8x8_internal_12bpc).transpose_8x8 + mova [cq+32* 0], m0 + mova [cq+32* 1], m2 + mova [cq+32* 2], m4 + mova [cq+32* 3], m6 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + pmaxsd m0, m12, m1 + pmaxsd m1, m12, m3 + pmaxsd m2, m12, m5 + pmaxsd m3, m12, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3 + test eobd, eobd + jge .pass2_slow + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + jmp .pass2_fast +.pass2_slow: + sub r6, 32*8 + mova m8, [r6-32*4] + mova m4, [r6-32*3] + mova m10, [r6-32*2] + mova m5, [r6-32*1] + mova m12, [r6+32*0] + mova m6, [r6+32*1] + mova m14, [r6+32*2] + mova m7, [r6+32*3] + TRANSPOSE_8X8_DWORD 8, 4, 10, 5, 12, 6, 14, 7, 9, 11, 13, 15 + mova [cq+32* 4], m8 + mova [cq+32* 5], m10 + mova [cq+32* 6], m12 + mova [cq+32* 7], m14 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m4, m5, m6, m7 + REPX {pminsd x, m13}, m4, m5, m6, m7 +.pass2_fast: + vpbroadcastd m11, [pd_2048] + vpbroadcastd m14, [pd_2896] + call m(idct_8x16_internal_10bpc).main_oddhalf + pmaxsd m0, m12, [cq+32* 0] + pmaxsd m1, m12, [cq+32* 1] + pmaxsd m2, m12, [cq+32* 2] + pmaxsd m3, m12, [cq+32* 3] + REPX {pminsd x, m13}, m0, m1, m2, m3 + test eobd, eobd + jge .pass2_slow2 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + jmp .pass2_fast2 +.pass2_slow2: + pmaxsd m4, m12, [cq+32* 4] + pmaxsd m5, m12, [cq+32* 5] + pmaxsd m6, m12, [cq+32* 6] + pmaxsd m7, m12, [cq+32* 7] + REPX {pminsd x, m13}, m4, m5, m6, m7 +.pass2_fast2: + call m(idct_8x8_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_evenhalf + psrad m11, 8 ; pd_8 + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct_16x8_internal_10bpc).pass1_rotations + REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + ret + +INV_TXFM_16X16_FN adst, dct, 0, 12 +INV_TXFM_16X16_FN adst, adst, 0, 12 +INV_TXFM_16X16_FN adst, flipadst, 0, 12 + +cglobal iadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_20b_min] + vpbroadcastd m14, [clip_20b_max] + jmp m(iadst_16x16_internal_10bpc).pass1 +.pass2: + call .pass2_part1 + call m(iadst_16x8_internal_10bpc).pass1_rotations + call .pass2_part2 + call m(iadst_16x8_internal_10bpc).pass1_rotations +.pass2_part3: + REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 + REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 +.end: + packssdw m15, m14 + packssdw m14, m13, m12 + packssdw m13, m11, m10 + packssdw m12, m9, m8 + packssdw m11, m7, m6 + packssdw m10, m5, m4 + packssdw m7, m3, m2 + packssdw m6, m1, m0 + vpblendd m0, m6, [r5-32*4], 0x33 + vpblendd m1, m6, [r5-32*4], 0xcc + vpblendd m2, m7, [r5-32*3], 0x33 + vpblendd m3, m7, [r5-32*3], 0xcc + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + call m(idct_16x8_internal_12bpc).write_16x4_start + call m(idct_16x8_internal_10bpc).write_16x4_zero + vpblendd m0, m10, [r5-32*2], 0x33 + vpblendd m1, m10, [r5-32*2], 0xcc + vpblendd m2, m11, [r5-32*1], 0x33 + vpblendd m3, m11, [r5-32*1], 0xcc + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + call 
m(idct_16x8_internal_10bpc).write_16x4_zero + vpblendd m0, m12, [r5+32*0], 0x33 + vpblendd m1, m12, [r5+32*0], 0xcc + vpblendd m2, m13, [r5+32*1], 0x33 + vpblendd m3, m13, [r5+32*1], 0xcc + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + call m(idct_16x8_internal_10bpc).write_16x4_zero + vpblendd m0, m14, [r5+32*2], 0x33 + vpblendd m1, m14, [r5+32*2], 0xcc + vpblendd m2, m15, [r5+32*3], 0x33 + vpblendd m3, m15, [r5+32*3], 0xcc + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + call m(idct_16x8_internal_10bpc).write_16x4_zero + RET +ALIGN function_align +.pass2_part1: + mova [cq+32* 8], m8 + mova [cq+32* 9], m9 + mova [cq+32*10], m10 + mova [cq+32*11], m11 + mova [cq+32*12], m12 + mova [cq+32*13], m13 + mova [cq+32*14], m14 + mova [cq+32*15], m15 +.pass2_main: + call m(idct_8x8_internal_12bpc).transpose_8x8 + mova [cq+32* 0], m0 + mova [cq+32* 1], m3 + mova [cq+32* 2], m4 + mova [cq+32* 3], m7 + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] + pmaxsd m0, m13, m2 + pmaxsd m2, m13, m6 + pmaxsd m5, m13, m5 + pmaxsd m7, m13, m1 + REPX {pminsd x, m14}, m0, m2, m5, m7 + test eobd, eobd + jge .pass2_slow + pxor m1, m1 + REPX {mova x, m1}, m3, m4, m6 + jmp .pass2_fast +.pass2_slow: + sub r6, 32*8 + mova m8, [r6-32*4] + mova m3, [r6-32*3] + mova m4, [r6-32*2] + mova m11, [r6-32*1] + mova m12, [r6+32*0] + mova m1, [r6+32*1] + mova m6, [r6+32*2] + mova m15, [r6+32*3] + TRANSPOSE_8X8_DWORD 8, 3, 4, 11, 12, 1, 6, 15, 13, 9, 10, 14 + mova [cq+32* 4], m8 + mova [cq+32* 5], m11 + mova [cq+32* 6], m12 + mova [cq+32* 7], m15 + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] + REPX {pmaxsd x, m13}, m1, m3, m4, m6 + REPX {pminsd x, m14}, m1, m3, m4, m6 +.pass2_fast: + vpbroadcastd m12, [pd_2048] + vpbroadcastd m15, [pd_2896] + call m(iadst_16x8_internal_10bpc).main_part1 + pmaxsd m0, m13, [cq+32* 0] ; 0 + pmaxsd m7, m13, [cq+32* 1] ; 3 + pmaxsd m2, m13, [cq+32* 2] ; 4 + pmaxsd m5, m13, [cq+32* 3] ; 7 + REPX {pminsd x, m14}, m0, m2, m5, m7 + test eobd, eobd + jge .pass2_slow2 + pxor m1, m1 + REPX {mova x, m1}, m3, m4, m6 + jmp .pass2_fast2 +.pass2_slow2: + pmaxsd m4, m13, [cq+32* 4] ; 8 + pmaxsd m3, m13, [cq+32* 5] ; 11 + pmaxsd m6, m13, [cq+32* 6] ; 12 + pmaxsd m1, m13, [cq+32* 7] ; 15 + REPX {pminsd x, m14}, m1, m3, m4, m6 +.pass2_fast2: + call m(iadst_16x8_internal_10bpc).main_part2 + vpbroadcastd m14, [pd_17408] + psrld m15, 11 ; pd_1 + psubd m13, m14, m15 ; pd_17407 + pslld m15, 3 ; pd_8 + ret +ALIGN function_align +.pass2_part2: + REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 + REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 + packssdw m0, m1 + packssdw m1, m2, m3 + packssdw m2, m4, m5 + packssdw m3, m6, m7 + packssdw m4, m8, m9 + packssdw m5, m10, m11 + packssdw m6, m12, m13 + packssdw m7, m14, m15 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6-32*2], m2 + mova [r6-32*1], m3 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + mova m0, [cq+32* 8] + mova m1, [cq+32* 9] + mova m2, [cq+32*10] + mova m3, [cq+32*11] + mova m4, [cq+32*12] + mova m5, [cq+32*13] + mova m6, [cq+32*14] + mova m7, [cq+32*15] + mov r5, r6 + add r6, 32*16 + jmp .pass2_main + +INV_TXFM_16X16_FN flipadst, dct, 0, 12 +INV_TXFM_16X16_FN flipadst, adst, 0, 12 +INV_TXFM_16X16_FN flipadst, flipadst, 0, 12 + +cglobal iflipadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_20b_min] + vpbroadcastd m14, [clip_20b_max] + jmp 
m(iflipadst_16x16_internal_10bpc).pass1 +.pass2: + call m(iadst_16x16_internal_12bpc).pass2_part1 + call m(iflipadst_16x8_internal_10bpc).pass1_rotations + call m(iadst_16x16_internal_12bpc).pass2_part2 + call m(iflipadst_16x8_internal_10bpc).pass1_rotations + jmp m(iadst_16x16_internal_12bpc).pass2_part3 + +INV_TXFM_16X16_FN identity, dct, -92, 12 +INV_TXFM_16X16_FN identity, identity, 0, 12 + +%macro IDTX16_12BPC 1 ; src + pmulld m6, m7, m%1 + paddd m6, m15 + psrad m6, 12 + paddd m6, m%1 + psrad m%1, m6, 1 +%endmacro + +cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m7, [pd_1697] + vpbroadcastd m15, [pd_5120] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + mov r3, -32*8*4 +.righthalf: + mova m10, [cq+r3+32*33] + mova m11, [cq+r3+32*35] + mova m12, [cq+r3+32*37] + mova m13, [cq+r3+32*39] + add r6, 32*4 + pmulld m0, m7, m10 + pmulld m1, m7, m11 + pmulld m2, m7, m12 + pmulld m3, m7, m13 + REPX {paddd x, m15}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 + paddd m0, m10 + paddd m1, m11 + paddd m2, m12 + paddd m3, m13 + REPX {psrad x, 1 }, m0, m1, m2, m3 + mova [r6+32*0], m0 + mova [r6+32*1], m1 + mova [r6+32*2], m2 + mova [r6+32*3], m3 + add r3, 32*8 + jl .righthalf +.fast: + mova m0, [cq+64* 0] + mova m1, [cq+64* 1] + mova m2, [cq+64* 2] + mova m3, [cq+64* 3] + mova m4, [cq+64* 4] + mova m5, [cq+64* 5] + mova m8, [cq+64* 6] + mova m9, [cq+64* 7] + REPX {IDTX16_12BPC x}, 0, 1, 2, 3, 4, 5, 8, 9 + mova [cq+64*0], m8 + mova [cq+64*1], m9 + mova m8, [cq+64* 8] + mova m9, [cq+64* 9] + mova m10, [cq+64*10] + mova m11, [cq+64*11] + mova m12, [cq+64*12] + mova m13, [cq+64*13] + mova m14, [cq+64*14] + REPX {IDTX16_12BPC x}, 8, 9, 10, 11, 12, 13, 14 + mova m6, [cq+64*15] + pmulld m7, m6 + paddd m7, m15 + psrad m7, 12 + paddd m7, m6 + mova m6, [cq+64*0] + psrad m15, m7, 1 + mova m7, [cq+64*1] + jmp tx2q +.pass2: + call m(iidentity_8x16_internal_12bpc).pass2_main + call m(idct_16x16_internal_10bpc).transpose_fast + test eobd, eobd + jl .pass2_fast + mova [cq+32* 8], m0 + mova [cq+32* 9], m1 + mova [cq+32*10], m2 + mova [cq+32*11], m3 + mova [cq+32*12], m4 + mova [cq+32*13], m5 + mova [cq+32*14], m6 + mova [cq+32*15], m7 + mova m8, [r6-32*4] + mova m9, [r6-32*3] + mova m10, [r6-32*2] + mova m11, [r6-32*1] + mova m12, [r6+32*0] + mova m13, [r6+32*1] + mova m14, [r6+32*2] + mova m15, [r6+32*3] + sub r6, 32*8 + mova m0, [r6-32*4] + mova m1, [r6-32*3] + mova m2, [r6-32*2] + mova m3, [r6-32*1] + mova m4, [r6+32*0] + mova m5, [r6+32*1] + mova m6, [r6+32*2] + mova m7, [r6+32*3] + call m(iidentity_8x16_internal_12bpc).pass2_main + call m(idct_16x8_internal_10bpc).transpose2 + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 + mova m12, m4 + mova m13, m5 + mova m14, m6 + mova m15, m7 + mova m0, [cq+32* 8] + mova m1, [cq+32* 9] + mova m2, [cq+32*10] + mova m3, [cq+32*11] + mova m4, [cq+32*12] + mova m5, [cq+32*13] + mova m6, [cq+32*14] + mova m7, [cq+32*15] +.pass2_fast: + call m(idct_16x16_internal_12bpc).write_16x16 + RET + +%macro IDCT32_END 6-7 1 ; in/out1, out2, tmp[1-3], shift, pack + mova m%4, [r6+32*(%1-4)] + mova m%2, [r5+32*(3-%1)] + mova m%5, [r4+32*(%1-4)] + psubd m%3, m%1, m%4 ; idct16 out15 - n + paddd m%1, m%4 ; idct16 out0 + n + pmaxsd m%1, m12 + pmaxsd m%3, m12 + pminsd m%1, m13 + pminsd m%3, m13 + paddd m%1, m11 + paddd m%3, m11 + psubd m%4, m%1, m%2 ; out31 - n + paddd m%1, m%2 ; out0 + n + paddd m%2, m%3, m%5 ; out15 - n + psubd m%3, m%5 ; out16 + n + REPX {psrad x, %6}, m%1, m%3, m%2, m%4 +%if %7 & 1 + packssdw m%1, m%3 ; out0 + 
n, out16 + n + packssdw m%2, m%4 ; out15 - n, out31 - n +%endif +%endmacro + +cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 7, 16, 32*12, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vbroadcasti128 m14, [idct32_shuf] + mov r4, cq + call .pass1_main + mova [rsp+32*0], m2 + mova [rsp+32*1], m3 + cmp eobd, 43 + jge .eob43 + pxor m4, m4 + REPX {mova x, m4}, [rsp+32*2], m2, m3, m11 + jmp .pass1_end_fast +.eob43: + lea r6, [rsp+32*8] + mova [r6-32*4], m0 + mova [r6-32*3], m1 + call .pass1_main + mova [rsp+32*2], m2 + cmp eobd, 107 + jge .eob107 + mova m11, m3 + mova m2, m0 + mova m3, m1 + mova m0, [r6-32*4] + mova m1, [r6-32*3] + pxor m4, m4 +.pass1_end_fast: + vpbroadcastd m10, [pw_2048] + lea r6, [deint_shuf+128] + REPX {mova x, m4}, m5, m6, m7 + call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast + jmp .end +.eob107: + mova [rsp+32*3], m3 + mova [r6-32*2], m0 + mova [r6-32*1], m1 + call .pass1_main + cmp eobd, 171 + jge .eob171 + pshufd m12, m2, q1032 + pshufd m13, m3, q1032 + mova m4, m0 + mova m5, m1 + pxor m6, m6 + REPX {mova x, m6}, m7, m14, m15 + jmp .pass1_end +.eob171: + mova [r6+32*0], m0 + mova [r6+32*1], m1 + mova [r6+32*2], m2 + mova [r6+32*3], m3 + call .pass1_main + pshufd m12, [r6+32*2], q1032 ; out19 out17 + pshufd m13, [r6+32*3], q1032 ; out23 out21 + mova m4, [r6+32*0] ; out16 out18 + mova m5, [r6+32*1] ; out20 out22 + pshufd m14, m2, q1032 ; out27 out25 + pshufd m15, m3, q1032 ; out31 out29 + mova m6, m0 ; out24 out26 + mova m7, m1 ; out28 out30 +.pass1_end: + mova m0, [r6-32*4] ; out0 out2 + mova m1, [r6-32*3] ; out4 out6 + mova m2, [r6-32*2] ; out8 out10 + mova m3, [r6-32*1] ; out12 out14 + lea r6, [deint_shuf+128] + mova m11, [rsp+32*3] ; out13 out15 + vpbroadcastd m10, [pw_2048] + call m(inv_txfm_add_dct_dct_8x32_8bpc).main +.end: ; [rsp+0*32] = m12 + vpbroadcastd m12, [pw_2048] + mov cq, r4 + mova [rsp+32*1], m8 + mova [rsp+32*2], m9 + mova [rsp+32*3], m10 + mova [rsp+32*4], m11 + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_10bpc).write_8x4_start + vpermq m0, m2, q3120 + vpermq m1, m3, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, m4, q3120 + vpermq m1, m5, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, m6, q3120 + vpermq m1, m7, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, [rsp+32*1], q3120 + vpermq m1, [rsp+32*2], q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, [rsp+32*3], q3120 + vpermq m1, [rsp+32*4], q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, [rsp+32*0], q3120 + vpermq m1, m13, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, m14, q3120 + vpermq m1, m15, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_10bpc).write_8x4 + RET +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m2, [dconly_10bpc] + mov [cq], eobd ; 0 + or r3d, 32 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3 +ALIGN function_align +.pass1_main_part1: + mova m0, [cq+128*0] + mova m1, [cq+128*1] + mova m2, [cq+128*2] + mova m3, [cq+128*3] + mova m4, [cq+128*4] + mova m5, [cq+128*5] + mova m6, [cq+128*6] + mova m7, 
[cq+128*7] + call m(idct_8x8_internal_10bpc).main + psrld m1, m11, 10 ; pd_2 + REPX {paddd x, m1}, m0, m6, m5, m3 + paddd m1, m6, m7 ; out1 + psubd m6, m7 ; out6 + psubd m7, m0, m9 ; out7 + paddd m0, m9 ; out0 + paddd m2, m5, m4 ; out2 + psubd m5, m4 ; out5 + psubd m4, m3, m8 ; out4 + paddd m3, m8 ; out3 + REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 + ret +ALIGN function_align +.pass1_main: + call .pass1_main_part1 + add cq, 32 + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + packssdw m6, m7 + pshufb m0, m14 + pshufb m2, m14 + pshufb m4, m14 + pshufb m6, m14 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + vperm2i128 m1, m0, m2, 0x31 ; 4 6 + vinserti128 m0, xm2, 1 ; 0 2 + vinserti128 m2, m3, xm4, 1 ; 1 3 + vperm2i128 m3, m4, 0x31 ; 5 7 + ret +.main_oddhalf_part1_fast_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 +.main_oddhalf_part1_fast: ; lower half zero + vpbroadcastd m7, [pd_4091] + vpbroadcastd m8, [pd_201] + vpbroadcastd m6, [pd_m1380] + vpbroadcastd m9, [pd_3857] + vpbroadcastd m5, [pd_3703] + vpbroadcastd m10, [pd_1751] + vpbroadcastd m4, [pd_m2751] + vpbroadcastd m15, [pd_3035] + pmulld m7, m0 + pmulld m0, m8 + pmulld m6, m1 + pmulld m1, m9 + pmulld m5, m2 + pmulld m2, m10 + pmulld m4, m3 + pmulld m3, m15 + jmp .main_oddhalf_part1_fast2 +.main_oddhalf_part1_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 +.main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31 + ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 201, 4091 ; t16a, t31a + ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a + ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a + ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a +.main_oddhalf_part1_fast2: + REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 + REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 + psubd m8, m0, m4 ; t17 + paddd m0, m4 ; t16 + psubd m4, m6, m2 ; t18 + paddd m6, m2 ; t19 + psubd m2, m1, m5 ; t29 + paddd m1, m5 ; t28 + psubd m5, m7, m3 ; t30 + paddd m7, m3 ; t31 + REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 + REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 + vpbroadcastd m15, [pd_4017] + vpbroadcastd m10, [pd_799] + ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a + ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 2 ; t29a, t18a + psubd m3, m0, m6 ; t19a + paddd m0, m6 ; t16a + psubd m6, m7, m1 ; t28a + paddd m7, m1 ; t31a + psubd m1, m5, m4 ; t18 + paddd m5, m4 ; t17 + psubd m4, m8, m2 ; t29 + paddd m8, m2 ; t30 + REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8 + REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 + vpbroadcastd m15, [pd_3784] + vpbroadcastd m10, [pd_1567] + ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a + ITX_MULSUB_2D 6, 3, 2, 9, _, 11, 10, 15 ; t19, t28 + mova [r6-32*4], m0 + mova [r6-32*3], m5 + mova [r6-32*2], m4 + mova [r6-32*1], m6 + mova [r6+32*0], m3 + mova [r6+32*1], m1 + mova [r6+32*2], m8 + mova [r6+32*3], m7 + ret +.main_oddhalf_part2_fast_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 +.main_oddhalf_part2_fast: ; lower half zero + vpbroadcastd m7, [pd_m601] + vpbroadcastd m8, [pd_4052] + vpbroadcastd m6, [pd_3973] + vpbroadcastd m9, [pd_995] + vpbroadcastd m5, [pd_m2106] + vpbroadcastd m10, [pd_3513] + vpbroadcastd m4, [pd_3290] + vpbroadcastd m15, [pd_2440] + pmulld m7, m0 + pmulld m0, m8 + pmulld m6, m1 + pmulld m1, m9 + pmulld m5, m2 + pmulld m2, m10 + pmulld m4, m3 + pmulld m3, m15 + 
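+ ; note: in this _fast path (lower half of the inputs zero) each initial
+ ; ITX_MULSUB_2D rotation degenerates: rotating (x, 0) only scales, so the
+ ; butterflies are replaced by plain pmulld with pre-negated constants
+ ; (pd_m601, pd_m2106) before rejoining the shared code at
+ ; .main_oddhalf_part2_fast2.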
jmp .main_oddhalf_part2_fast2 +.main_oddhalf_part2_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 +.main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29 + ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 4052, 601 ; t23a, t24a + ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 995, 3973 ; t20a, t27a + ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a + ITX_MULSUB_2D 3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a +.main_oddhalf_part2_fast2: + REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 + REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 + psubd m8, m0, m4 ; t25 + paddd m0, m4 ; t24 + psubd m4, m6, m2 ; t26 + paddd m6, m2 ; t27 + psubd m2, m1, m5 ; t21 + paddd m1, m5 ; t20 + psubd m5, m7, m3 ; t22 + paddd m7, m3 ; t23 + REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 + REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 + vpbroadcastd m15, [pd_2276] + vpbroadcastd m10, [pd_3406] + ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a + ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 2 ; t25a, t22a + psubd m3, m0, m6 ; t27a + paddd m0, m6 ; t24a + psubd m6, m7, m1 ; t20a + paddd m7, m1 ; t23a + psubd m1, m5, m4 ; t21 + paddd m5, m4 ; t22 + psubd m4, m8, m2 ; t26 + paddd m8, m2 ; t25 + REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8 + REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 + vpbroadcastd m15, [pd_3784] + vpbroadcastd m10, [pd_1567] + ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 2 ; t26a, t21a + ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 2 ; t27, t20 + mova m9, [r6-32*4] ; t16a + mova m10, [r6-32*3] ; t17 + psubd m2, m9, m7 ; t23 + paddd m9, m7 ; t16 + psubd m7, m10, m5 ; t22a + paddd m10, m5 ; t17a + REPX {pmaxsd x, m12}, m9, m10, m2, m7 + REPX {pminsd x, m13}, m9, m10, m2, m7 + mova [r6-32*4], m9 + mova [r6-32*3], m10 + mova m9, [r6-32*2] ; t18a + mova m10, [r6-32*1] ; t19 + psubd m5, m9, m1 ; t21 + paddd m9, m1 ; t18 + psubd m1, m10, m6 ; t20a + paddd m10, m6 ; t19a + REPX {pmaxsd x, m12}, m9, m10, m5, m1 + REPX {pminsd x, m13}, m9, m10, m5, m1 + mova [r6-32*2], m9 + mova [r6-32*1], m10 + mova m9, [r6+32*0] ; t28 + mova m10, [r6+32*1] ; t29a + psubd m6, m9, m3 ; t27a + paddd m9, m3 ; t28a + psubd m3, m10, m4 ; t26 + paddd m10, m4 ; t29 + REPX {pmaxsd x, m12}, m9, m10, m6, m3 + REPX {pminsd x, m13}, m9, m10, m6, m3 + REPX {pmulld x, m14}, m6, m3, m1, m5 + paddd m6, m11 + paddd m3, m11 + psubd m4, m6, m1 ; t20 + paddd m6, m1 ; t27 + psubd m1, m3, m5 ; t21a + paddd m3, m5 ; t26a + REPX {psrad x, 12 }, m4, m1, m3, m6 + mova [r6+32*0], m4 + mova [r6+32*1], m1 + mova m4, [r6+32*2] ; t30 + mova m1, [r6+32*3] ; t31a + psubd m5, m4, m8 ; t25a + paddd m4, m8 ; t30a + psubd m8, m1, m0 ; t24 + paddd m1, m0 ; t31 + REPX {pmaxsd x, m12}, m8, m5, m4, m1 + REPX {pminsd x, m13}, m8, m5, m4, m1 + REPX {pmulld x, m14}, m5, m8, m7, m2 + paddd m5, m11 + paddd m8, m11 + psubd m0, m5, m7 ; t22 + paddd m5, m7 ; t25 + psubd m7, m8, m2 ; t23a + paddd m2, m8 ; t24a + REPX {psrad x, 12 }, m0, m7, m2, m5 + mova [r6+32*2], m0 + mova [r6+32*3], m7 + mov r4, r6 + add r6, 32*8 + mova [r6-32*4], m2 + mova [r6-32*3], m5 + mova [r6-32*2], m3 + mova [r6-32*1], m6 + mova [r6+32*0], m9 + mova [r6+32*1], m10 + mova [r6+32*2], m4 + mova [r6+32*3], m1 + mov r5, r6 + add r6, 32*8 + ret +ALIGN function_align +.main_end: + psrld m11, 10 ; pd_2 + IDCT32_END 0, 15, 8, 9, 10, 2 + IDCT32_END 1, 14, 8, 9, 10, 2 + punpckhwd m8, m0, m1 ; 16 17 + punpcklwd m0, m1 ; 0 1 + punpcklwd m1, m14, m15 ; 14 15 + punpckhwd m14, m15 ; 30 31 + mova [r5+32*3], m8 + mova [r5+32*2], 
m14 + IDCT32_END 2, 15, 8, 9, 10, 2 + IDCT32_END 3, 14, 8, 9, 10, 2 + punpckhwd m8, m2, m3 ; 18 19 + punpcklwd m2, m3 ; 2 3 + punpcklwd m3, m14, m15 ; 12 13 + punpckhwd m14, m15 ; 28 29 + mova [r5+32*1], m8 + mova [r5+32*0], m14 + IDCT32_END 4, 15, 8, 9, 10, 2 + IDCT32_END 5, 14, 8, 9, 10, 2 + punpckhwd m8, m4, m5 ; 20 21 + punpcklwd m4, m5 ; 4 5 + punpcklwd m5, m14, m15 ; 10 11 + punpckhwd m14, m15 ; 26 27 + mova [r5-32*1], m8 + mova [r5-32*2], m14 + IDCT32_END 6, 15, 8, 9, 10, 2 + IDCT32_END 7, 14, 8, 9, 10, 2 + punpckhwd m8, m6, m7 ; 22 23 + punpcklwd m6, m7 ; 6 7 + punpcklwd m7, m14, m15 ; 8 9 + punpckhwd m14, m15 ; 24 25 + mova [r5-32*3], m8 + mova [r5-32*4], m14 +.transpose: + punpckhdq m15, m3, m1 + punpckldq m3, m1 + punpckhdq m1, m4, m6 + punpckldq m4, m6 + punpckhdq m6, m0, m2 + punpckldq m0, m2 + punpckhdq m2, m7, m5 + punpckldq m7, m5 + punpcklqdq m5, m2, m15 + punpckhqdq m2, m15 + punpckhqdq m15, m7, m3 + punpcklqdq m7, m3 + punpckhqdq m3, m6, m1 + punpcklqdq m6, m1 + punpckhqdq m1, m0, m4 + punpcklqdq m0, m4 + vperm2i128 m4, m0, m7, 0x31 + vinserti128 m0, xm7, 1 + vperm2i128 m7, m3, m2, 0x31 + vinserti128 m3, xm2, 1 + vinserti128 m2, m6, xm5, 1 + vperm2i128 m6, m5, 0x31 + vperm2i128 m5, m1, m15, 0x31 + vinserti128 m1, xm15, 1 + ret + +cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 7, 8, dst, stride, c, eob + vpbroadcastd m7, [pixel_10bpc_max] +.pass1: + vpbroadcastd m5, [pw_5] + pxor m6, m6 + mov r6d, eobd + add eobb, 21 + cmovc eobd, r6d ; 43, 107, 171 -> 64, 128, 192 + lea r6, [strideq*3] + lea r5, [strideq*5] + lea r4, [strideq+r6*2] ; strideq*7 +.loop: + mova m0, [cq+128*0] + packssdw m0, [cq+128*1] + mova m1, [cq+128*2] + packssdw m1, [cq+128*3] + mova m2, [cq+128*4] + packssdw m2, [cq+128*5] + mova m3, [cq+128*6] + packssdw m3, [cq+128*7] + REPX {paddsw x, m5}, m0, m1, m2, m3 + REPX {psraw x, 3 }, m0, m1, m2, m3 + call .main_zero + add cq, 32 + lea dstq, [dstq+strideq*8] + sub eobd, 64 + jge .loop + RET +ALIGN function_align +.main_zero: + REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 +.main: + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m4 + punpcklwd m0, m4 + punpckhwd m4, m2, m1 + punpcklwd m2, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + mova xm4, [dstq+strideq*0] + vinserti128 m4, [dstq+strideq*4], 1 + paddw m0, m4 + mova xm4, [dstq+strideq*1] + vinserti128 m4, [dstq+r5 ], 1 + paddw m1, m4 + mova xm4, [dstq+strideq*2] + vinserti128 m4, [dstq+r6*2 ], 1 + paddw m2, m4 + mova xm4, [dstq+r6 ] + vinserti128 m4, [dstq+r4 ], 1 + paddw m3, m4 + REPX {pmaxsw x, m6}, m0, m1, m2, m3 + REPX {pminsw x, m7}, m0, m1, m2, m3 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*4], m0, 1 + mova [dstq+strideq*1], xm1 + vextracti128 [dstq+r5 ], m1, 1 + mova [dstq+strideq*2], xm2 + vextracti128 [dstq+r6*2 ], m2, 1 + mova [dstq+r6 ], xm3 + vextracti128 [dstq+r4 ], m3, 1 + ret + +cglobal inv_txfm_add_dct_dct_8x32_12bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + mov r4, cq + lea r6, [rsp+32*4] + call .pass1_main + cmp eobd, 43 + jge .eob43 + jmp .pass2_fast +.eob43: + call .pass1_main + cmp eobd, 107 + jge .eob107 +.pass2_fast: + mov cq, r4 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + pmaxsd m0, m12, [cq+128*1+ 0] + pmaxsd m1, m12, [cq+128*7+ 0] + pmaxsd m2, m12, 
[cq+128*1+32] + pmaxsd m3, m12, [cq+128*7+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + vpbroadcastd m14, [pd_2896] + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast + pmaxsd m0, m12, [cq+128*3+ 0] + pmaxsd m1, m12, [cq+128*5+ 0] + pmaxsd m2, m12, [cq+128*3+32] + pmaxsd m3, m12, [cq+128*5+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast + pmaxsd m0, m12, [cq+128*2+ 0] + pmaxsd m1, m12, [cq+128*6+ 0] + pmaxsd m2, m12, [cq+128*2+32] + pmaxsd m3, m12, [cq+128*6+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + call m(idct_8x16_internal_10bpc).main_oddhalf_fast + pmaxsd m0, m12, [cq+128*0+ 0] + pmaxsd m1, m12, [cq+128*4+ 0] + pmaxsd m2, m12, [cq+128*0+32] + pmaxsd m3, m12, [cq+128*4+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_evenhalf + jmp .pass2_end +.eob107: + call .pass1_main + cmp eobd, 171 + jge .eob171 + jmp .pass2 +.eob171: + call .pass1_main +.pass2: + mov cq, r4 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + pmaxsd m0, m12, [cq+128*1+ 0] + pmaxsd m1, m12, [cq+128*7+ 0] + pmaxsd m2, m12, [cq+128*1+32] + pmaxsd m3, m12, [cq+128*7+32] + pmaxsd m4, m12, [cq+128*1+64] + pmaxsd m5, m12, [cq+128*7+64] + pmaxsd m6, m12, [cq+128*1+96] + pmaxsd m7, m12, [cq+128*7+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + vpbroadcastd m14, [pd_2896] + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1 + pmaxsd m0, m12, [cq+128*3+ 0] + pmaxsd m1, m12, [cq+128*5+ 0] + pmaxsd m2, m12, [cq+128*3+32] + pmaxsd m3, m12, [cq+128*5+32] + pmaxsd m4, m12, [cq+128*3+64] + pmaxsd m5, m12, [cq+128*5+64] + pmaxsd m6, m12, [cq+128*3+96] + pmaxsd m7, m12, [cq+128*5+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2 + pmaxsd m0, m12, [cq+128*2+ 0] + pmaxsd m1, m12, [cq+128*6+ 0] + pmaxsd m2, m12, [cq+128*2+32] + pmaxsd m3, m12, [cq+128*6+32] + pmaxsd m4, m12, [cq+128*2+64] + pmaxsd m5, m12, [cq+128*6+64] + pmaxsd m6, m12, [cq+128*2+96] + pmaxsd m7, m12, [cq+128*6+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct_8x16_internal_10bpc).main_oddhalf + pmaxsd m0, m12, [cq+128*0+ 0] + pmaxsd m1, m12, [cq+128*4+ 0] + pmaxsd m2, m12, [cq+128*0+32] + pmaxsd m3, m12, [cq+128*4+32] + pmaxsd m4, m12, [cq+128*0+64] + pmaxsd m5, m12, [cq+128*4+64] + pmaxsd m6, m12, [cq+128*0+96] + pmaxsd m7, m12, [cq+128*4+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct_8x8_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_evenhalf +.pass2_end: + psrld m11, 8 ; pd_8 + IDCT32_END 0, 15, 8, 9, 10, 4 + IDCT32_END 1, 14, 8, 9, 10, 4 + punpckhqdq m8, m0, m1 ; 16 17 (interleaved) + punpcklqdq m0, m1 ; 0 1 (interleaved) + punpcklqdq m1, m14, m15 ; 14 15 (interleaved) + punpckhqdq m14, m15 ; 30 31 (interleaved) + mova [r5+32*3], m8 + mova [r5+32*2], m14 + IDCT32_END 2, 15, 8, 9, 10, 4 + IDCT32_END 3, 14, 8, 9, 10, 4 + punpckhqdq m8, m2, m3 ; 18 19 (interleaved) + punpcklqdq m2, m3 ; 2 3 (interleaved) + punpcklqdq m3, m14, m15 ; 12 13 (interleaved) + punpckhqdq m14, m15 ; 28 29 (interleaved) + mova [r5+32*1], m8 + mova [r5+32*0], m14 + IDCT32_END 4, 15, 8, 9, 10, 4 + IDCT32_END 5, 14, 8, 9, 10, 4 + punpckhqdq m8, m4, m5 ; 20 21 (interleaved) + punpcklqdq m4, m5 ; 4 5 (interleaved) + punpcklqdq m5, m14, m15 ; 10 11 (interleaved) + punpckhqdq m14, m15 ; 26 27 (interleaved) + mova [r5-32*1], m8 + mova 
[r5-32*2], m14 + IDCT32_END 6, 15, 8, 9, 10, 4 + IDCT32_END 7, 14, 8, 9, 10, 4 + punpckhqdq m8, m6, m7 ; 22 23 (interleaved) + punpcklqdq m6, m7 ; 6 7 (interleaved) + punpcklqdq m7, m14, m15 ; 8 9 (interleaved) + punpckhqdq m14, m15 ; 24 25 (interleaved) + mova [r5-32*3], m8 + mova [r5-32*4], m14 + mova m15, m1 +.end: + vpermq m0, m0, q3120 + vpermq m1, m2, q3120 + call m(idct_8x8_internal_12bpc).write_8x4_start + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, m4, q3120 + vpermq m1, m6, q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, m7, q3120 + vpermq m1, m5, q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, m3, q3120 + vpermq m1, m15, q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, [r5+32*3], q3120 + vpermq m1, [r5+32*1], q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, [r5-32*1], q3120 + vpermq m1, [r5-32*3], q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, [r5-32*4], q3120 + vpermq m1, [r5-32*2], q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, [r5+32*0], q3120 + vpermq m1, [r5+32*2], q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + RET +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m2, [dconly_12bpc] + mov [cq], eobd ; 0 + or r3d, 32 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3 +ALIGN function_align +.pass1_main: + call m(inv_txfm_add_dct_dct_8x32_10bpc).pass1_main_part1 + TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15 + mova [cq+128*0], m0 + mova [cq+128*1], m1 + mova [cq+128*2], m2 + mova [cq+128*3], m3 + mova [cq+128*4], m4 + mova [cq+128*5], m5 + mova [cq+128*6], m6 + mova [cq+128*7], m7 + add cq, 32 + ret +ALIGN function_align +.main_end: + psrld m11, 10 ; pd_2 + IDCT32_END 0, 15, 8, 9, 10, 2, 0 + mova [cq+32*16], m8 + mova [cq+32*31], m9 + IDCT32_END 1, 14, 8, 9, 10, 2, 0 + mova [cq+32*17], m8 + mova [cq+32*30], m9 + mova [cq+32*14], m14 + IDCT32_END 2, 14, 8, 9, 10, 2, 0 + mova [cq+32*18], m8 + mova [cq+32*29], m9 + mova [cq+32*13], m14 + IDCT32_END 3, 14, 8, 9, 10, 2, 0 + mova [cq+32*19], m8 + mova [cq+32*28], m9 + mova [cq+32*12], m14 + IDCT32_END 4, 14, 8, 9, 10, 2, 0 + mova [cq+32*20], m8 + mova [cq+32*27], m9 + mova [cq+32* 0], m0 + mova [cq+32* 1], m1 + mova [cq+32* 2], m2 + IDCT32_END 5, 10, 0, 1, 2, 2, 0 + mova [cq+32*21], m0 + mova [cq+32*26], m1 + IDCT32_END 6, 9, 0, 1, 2, 2, 0 + mova [cq+32*22], m0 + mova [cq+32*25], m1 + IDCT32_END 7, 8, 0, 1, 2, 2, 0 + mova [cq+32*23], m0 + mova [cq+32*24], m1 + mova m0, [cq+32* 0] + mova m1, [cq+32* 1] + mova m2, [cq+32* 2] + mova m11, m14 + mova m12, [cq+32*12] + mova m13, [cq+32*13] + mova m14, [cq+32*14] + ret + +cglobal inv_txfm_add_identity_identity_8x32_12bpc, 4, 7, 8, dst, stride, c, eob + vpbroadcastd m7, [pixel_12bpc_max] + jmp m(inv_txfm_add_identity_identity_8x32_10bpc).pass1 + +cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jnz .full + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] + mov [cq], eobd ; 0 + or r3d, 8 +.dconly: + add r6d, 640 + sar r6d, 10 +.dconly2: + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 + movd xm0, r6d + paddsw xm0, xm3 + vpbroadcastw m0, xm0 +.dconly_loop: + paddsw m1, m0, [dstq+32*0] + paddsw m2, m0, [dstq+32*1] + psubusw m1, m3 + psubusw m2, m3 + mova [dstq+32*0], m1 + mova [dstq+32*1], m2 + add dstq, strideq + dec r3d + jg .dconly_loop + RET +.full: + PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob + lea r6, [rsp+32*4] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + call .pass1 + call 
m(inv_txfm_add_dct_dct_8x32_10bpc).main_end + lea r6, [deint_shuf+128] + vpbroadcastd m11, [pw_2048] + mov r4, dstq + call .pass2 + mova m0, [r5+32*3] ; 16 17 + mova m1, [r5+32*2] ; 30 31 + mova m2, [r5+32*1] ; 18 19 + mova m3, [r5+32*0] ; 28 29 + mova m4, [r5-32*1] ; 20 21 + mova m5, [r5-32*2] ; 26 27 + mova m6, [r5-32*3] ; 22 23 + mova m7, [r5-32*4] ; 24 25 + call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose + lea dstq, [r4+32] + call .pass2 + RET +ALIGN function_align +.pass2: + call m(idct_16x8_internal_8bpc).main + REPX {pmulhrsw x, m11}, m0, m1, m2, m3 + call m(idct_16x8_internal_10bpc).write_16x4_start + pmulhrsw m0, m11, m4 + pmulhrsw m1, m11, m5 + pmulhrsw m2, m11, m6 + pmulhrsw m3, m11, m7 + jmp m(idct_16x8_internal_10bpc).write_16x4_zero +ALIGN function_align +.pass1: + mova m0, [cq+32* 1] + mova m1, [cq+32* 7] + mova m2, [cq+32* 9] + mova m3, [cq+32*15] + mova m4, [cq+32*17] + mova m5, [cq+32*23] + mova m6, [cq+32*25] + mova m7, [cq+32*31] + vpbroadcastd m11, [pd_2048] + vpbroadcastd m14, [pd_2896] + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1 + mova m0, [cq+32* 3] + mova m1, [cq+32* 5] + mova m2, [cq+32*11] + mova m3, [cq+32*13] + mova m4, [cq+32*19] + mova m5, [cq+32*21] + mova m6, [cq+32*27] + mova m7, [cq+32*29] + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2 + mova m0, [cq+32* 2] + mova m1, [cq+32* 6] + mova m2, [cq+32*10] + mova m3, [cq+32*14] + mova m4, [cq+32*18] + mova m5, [cq+32*22] + mova m6, [cq+32*26] + mova m7, [cq+32*30] + call m(idct_8x16_internal_10bpc).main_oddhalf + mova m0, [cq+32* 0] + mova m1, [cq+32* 4] + mova m2, [cq+32* 8] + mova m3, [cq+32*12] + mova m4, [cq+32*16] + mova m5, [cq+32*20] + mova m6, [cq+32*24] + mova m7, [cq+32*28] + call m(idct_8x8_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_evenhalf + ret + +cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 8, dst, stride, c, eob + vpbroadcastd m7, [pixel_10bpc_max] +.pass1: + vpbroadcastd m5, [pw_4096] + pxor m6, m6 + mov r6d, eobd + add eobb, 21 + cmovc eobd, r6d + lea r6, [strideq*3] + lea r5, [strideq*5] + lea r4, [strideq+r6*2] ; strideq*7 +.loop: + mova m0, [cq+32*0] + packssdw m0, [cq+32*1] + mova m1, [cq+32*2] + packssdw m1, [cq+32*3] + REPX {mova [cq+32*x], m6}, 0, 1, 2, 3 + add cq, 32*8 + mova m2, [cq-32*4] + packssdw m2, [cq-32*3] + mova m3, [cq-32*2] + packssdw m3, [cq-32*1] + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + REPX {mova [cq+32*x], m6}, -4, -3, -2, -1 + call m(inv_txfm_add_identity_identity_8x32_10bpc).main + add dstq, 16 + sub eobd, 64 + jge .loop + RET + +cglobal inv_txfm_add_dct_dct_32x8_12bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jnz .full + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_12bpc] + mov [cq], eobd ; 0 + or r3d, 8 + jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly +.full: + PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob + lea r6, [rsp+32*4] + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + call m(inv_txfm_add_dct_dct_32x8_10bpc).pass1 + call m(inv_txfm_add_dct_dct_8x32_12bpc).main_end + mov r4, dstq + call m(idct_16x8_internal_12bpc).pass2_main + mova m0, [cq+32* 0] ; 16 + mova m1, [cq+32* 1] ; 17 + mova m2, [cq+32* 2] ; 18 + mova m3, [cq+32* 3] ; 19 + mova m4, [cq+32* 4] ; 20 + mova m5, [cq+32* 5] ; 21 + mova m6, [cq+32* 6] ; 22 + mova m7, [cq+32* 7] ; 23 + mova m8, [cq+32* 8] ; 24 + mova m9, [cq+32* 9] ; 25 + mova m10, [cq+32*10] ; 26 + mova m11, [cq+32*11] ; 27 + mova m12, [cq+32*12] ; 28 + mova m13, [cq+32*13] ; 29 + mova m14, [cq+32*14] ; 30 + mova m15, [cq+32*15] ; 31 + lea dstq, 
[r4+32] + call m(idct_16x8_internal_12bpc).pass2_main + RET + +cglobal inv_txfm_add_identity_identity_32x8_12bpc, 4, 7, 8, dst, stride, c, eob + vpbroadcastd m7, [pixel_12bpc_max] + jmp m(inv_txfm_add_identity_identity_32x8_10bpc).pass1 + +%macro IDCT32_PASS2_END 6 ; coefs[1-2], tmp[1-2], offset[1-2] + mova m%4, [%2] + paddsw m%3, m%1, m%4 + psubsw m%1, m%4 +%if %1 == 0 + pxor m6, m6 +%endif + pmulhrsw m%3, m15 + pmulhrsw m%1, m15 + paddw m%3, [dstq+%5] + paddw m%1, [r2+%6] + pmaxsw m%3, m6 + pmaxsw m%1, m6 + pminsw m%3, m7 + pminsw m%1, m7 + mova [dstq+%5], m%3 + mova [r2+%6], m%1 +%endmacro + +cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 8, 16, 32*36, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vpbroadcastd m14, [pd_2896] + lea r6, [rsp+32*16] + lea r4, [r6+32*8] + lea r5, [r6+32*16] + call .main + sub eobd, 44 + jge .eob44 + vperm2i128 m2, m0, m3, 0x31 ; 5 + vinserti128 m0, xm3, 1 ; 1 + vperm2i128 m3, m1, m4, 0x31 ; 7 + vinserti128 m1, xm4, 1 ; 3 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + REPX {mova [r6+32*x], m4}, 0, 1, 2, 3 + jmp .fast +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] + mov [cq], eobd ; 0 + or r3d, 32 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2 +.eob44: + mova [r4+16*0], xm0 + mova [r4+16*1], xm3 + mova [r4+16*2], xm1 + mova [r4+16*3], xm4 + vextracti128 [r4+16*4], m0, 1 + vextracti128 [r4+16*5], m3, 1 + vextracti128 [r4+16*6], m1, 1 + vextracti128 [r4+16*7], m4, 1 + call .main + sub eobd, 107 + jge .eob151 + vperm2i128 m7, m1, m4, 0x31 ; 15 + vinserti128 m5, m1, xm4, 1 ; 11 + vperm2i128 m6, m0, m3, 0x31 ; 13 + vinserti128 m4, m0, xm3, 1 ; 9 + mova m0, [r4+32*0] + mova m1, [r4+32*1] + mova m2, [r4+32*2] + mova m3, [r4+32*3] +.fast: + lea r6, [pw_5+128] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + jmp .idct16 +.eob151: + mova [r4-16*8], xm0 + mova [r4-16*7], xm3 + mova [r4-16*6], xm1 + mova [r4-16*5], xm4 + vextracti128 [r4-16*4], m0, 1 + vextracti128 [r4-16*3], m3, 1 + vextracti128 [r4-16*2], m1, 1 + vextracti128 [r4-16*1], m4, 1 + call .main + sub eobd, 128 + jge .eob279 + vperm2i128 m10, m0, m3, 0x31 ; 21 + vinserti128 m8, m0, xm3, 1 ; 17 + vperm2i128 m11, m1, m4, 0x31 ; 23 + vinserti128 m9, m1, xm4, 1 ; 19 + pxor m12, m12 + REPX {mova x, m12}, m13, m14, m15 + REPX {mova [r6+32*x], m12}, 0, 1, 2, 3 + jmp .full +.eob279: + mova [r5+16*0], xm0 + mova [r5+16*1], xm3 + mova [r5+16*2], xm1 + mova [r5+16*3], xm4 + vextracti128 [r5+16*4], m0, 1 + vextracti128 [r5+16*5], m3, 1 + vextracti128 [r5+16*6], m1, 1 + vextracti128 [r5+16*7], m4, 1 + call .main + vperm2i128 m14, m0, m3, 0x31 ; 29 + vinserti128 m12, m0, xm3, 1 ; 25 + vperm2i128 m15, m1, m4, 0x31 ; 31 + vinserti128 m13, m1, xm4, 1 ; 27 + mova m8, [r5+32*0] + mova m9, [r5+32*1] + mova m10, [r5+32*2] + mova m11, [r5+32*3] +.full: + mova m0, [r4+32*0] + mova m1, [r4+32*1] + mova m2, [r4+32*2] + mova m3, [r4+32*3] + mova m4, [r4-32*4] + mova m5, [r4-32*3] + mova m6, [r4-32*2] + mova m7, [r4-32*1] + lea r6, [pw_5 + 128] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf + lea r3, [rsp+32*8] + mova m8, [r3+32*0] + mova m9, [r3+32*1] + mova m10, [r3+32*2] + mova m11, [r3+32*3] + mova m12, [r3-32*4] + mova m13, [r3-32*3] + mova m14, [r3-32*2] + mova m15, [r3-32*1] +.idct16: + lea r3, [rsp+32*16] + mova m0, 
[r3+32*0] + mova m1, [r3+32*1] + mova m2, [r3+32*2] + mova m3, [r3+32*3] + mova m4, [r3-32*4] + mova m5, [r3-32*3] + mova m6, [r3-32*2] + mova m7, [r3-32*1] + mova [rsp], m15 + call m(idct_16x16_internal_8bpc).main + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq + call .pass2_end + RET +ALIGN function_align +.main: + pmulld m0, m14, [cq+128* 1] + pmulld m1, m14, [cq+128* 3] + pmulld m2, m14, [cq+128* 5] + pmulld m3, m14, [cq+128* 7] + pmulld m4, m14, [cq+128* 9] + pmulld m5, m14, [cq+128*11] + pmulld m6, m14, [cq+128*13] + pmulld m7, m14, [cq+128*15] + call m(idct_8x16_internal_10bpc).main_oddhalf_rect2 + pmulld m0, m14, [cq+128* 0] + pmulld m1, m14, [cq+128* 2] + pmulld m2, m14, [cq+128* 4] + pmulld m3, m14, [cq+128* 6] + pmulld m4, m14, [cq+128* 8] + pmulld m5, m14, [cq+128*10] + pmulld m6, m14, [cq+128*12] + pmulld m7, m14, [cq+128*14] + call m(idct_8x8_internal_10bpc).main_rect2 + call m(idct_8x16_internal_10bpc).main_evenhalf + psrld m15, m11, 11 ; pd_1 + mova m8, [r6-32*4] + mova m9, [r6-32*3] + REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7 + psubd m10, m0, m8 ; out15 + paddd m0, m8 ; out0 + mova m8, [r6-32*2] + paddd m15, m1, m9 ; out1 + psubd m1, m9 ; out14 + mova m9, [r6-32*1] + REPX {psrad x, 1}, m0, m15, m10, m1 + packssdw m0, m15 + packssdw m1, m10 + psubd m10, m2, m8 ; out13 + paddd m2, m8 ; out2 + mova m8, [r6+32*0] + paddd m15, m3, m9 ; out3 + psubd m3, m9 ; out12 + mova m9, [r6+32*1] + REPX {psrad x, 1}, m2, m15, m10, m3 + packssdw m2, m15 + packssdw m3, m10 + psubd m10, m4, m8 ; out11 + paddd m4, m8 ; out4 + mova m8, [r6+32*2] + paddd m15, m5, m9 ; out5 + psubd m5, m9 ; out10 + mova m9, [r6+32*3] + REPX {psrad x, 1}, m4, m10, m15, m5 + packssdw m4, m15 + packssdw m5, m10 + psubd m10, m6, m8 ; out9 + paddd m6, m8 ; out6 + paddd m15, m7, m9 ; out7 + psubd m7, m9 ; out8 + REPX {psrad x, 1}, m6, m10, m15, m7 + packssdw m6, m15 + packssdw m7, m10 + punpckhwd m8, m0, m2 + punpcklwd m0, m2 + punpckhwd m2, m3, m1 + punpcklwd m3, m1 + punpckhwd m1, m4, m6 + punpcklwd m4, m6 + punpcklwd m6, m7, m5 + punpckhwd m7, m5 + pxor m5, m5 + mov r7d, 128*13 +.main_zero_loop: + mova [cq+r7-128*1], m5 + mova [cq+r7+128*0], m5 + mova [cq+r7+128*1], m5 + mova [cq+r7+128*2], m5 + sub r7d, 128*4 + jg .main_zero_loop + add cq, 32 + punpcklwd m5, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m1 + punpckhwd m4, m1 + punpckhwd m1, m0, m8 + punpcklwd m0, m8 + punpckhwd m8, m6, m7 + punpcklwd m6, m7 + punpcklqdq m7, m1, m4 + punpckhqdq m1, m4 + punpckhqdq m4, m8, m3 + punpcklqdq m8, m3 + punpckhqdq m3, m6, m5 + punpcklqdq m6, m5 + punpcklqdq m5, m0, m2 + punpckhqdq m0, m2 + mova [r6+16*0], xm5 + mova [r6+16*1], xm6 + mova [r6+16*2], xm7 + mova [r6+16*3], xm8 + vextracti128 [r6+16*4], m5, 1 + vextracti128 [r6+16*5], m6, 1 + vextracti128 [r6+16*6], m7, 1 + vextracti128 [r6+16*7], m8, 1 + sub r6, 32*4 + ret +ALIGN function_align +.pass2_end: + mova [rsp+gprsize+32*0], m6 + mova [rsp+gprsize+32*2], m7 + mova [rsp+gprsize+32*3], m15 + vpbroadcastd m15, [pw_2048] + vpbroadcastd m7, [pixel_10bpc_max] + IDCT32_PASS2_END 0, r5+32*3, 1, 6, strideq*0, r3*4 + IDCT32_PASS2_END 4, r5-32*1, 0, 1, strideq*4, strideq*8 + IDCT32_PASS2_END 8, r4+32*3, 0, 4, strideq*8, strideq*4 + IDCT32_PASS2_END 12, r4-32*1, 0, 4, r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + mova m1, [rsp+gprsize+32*1] + IDCT32_PASS2_END 1, r5+32*2, 0, 4, strideq*0, r3*4 + IDCT32_PASS2_END 5, r5-32*2, 0, 4, strideq*4, strideq*8 + IDCT32_PASS2_END 9, r4+32*2, 0, 4, strideq*8, strideq*4 + IDCT32_PASS2_END 13, r4-32*2, 0, 4, 
r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + mova m1, [rsp+gprsize+32*0] + IDCT32_PASS2_END 2, r5+32*1, 0, 4, strideq*0, r3*4 + IDCT32_PASS2_END 1, r5-32*3, 0, 4, strideq*4, strideq*8 + IDCT32_PASS2_END 10, r4+32*1, 0, 4, strideq*8, strideq*4 + IDCT32_PASS2_END 14, r4-32*3, 0, 4, r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + mova m1, [rsp+gprsize+32*2] + mova m2, [rsp+gprsize+32*3] + IDCT32_PASS2_END 3, r5+32*0, 0, 4, strideq*0, r3*4 + IDCT32_PASS2_END 1, r5-32*4, 0, 4, strideq*4, strideq*8 + IDCT32_PASS2_END 11, r4+32*0, 0, 4, strideq*8, strideq*4 + IDCT32_PASS2_END 2, r4-32*4, 0, 4, r3*4, strideq*0 + ret + +cglobal inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 12, dst, stride, c, eob + vpbroadcastd m7, [pixel_10bpc_max] +.pass1: + vpbroadcastd m8, [pw_2896x8] + vpbroadcastd m9, [pw_1697x16] + vpbroadcastd m11, [pw_8192] + lea r6, [strideq*5] + pxor m6, m6 + paddw m10, m11, m11 ; pw_16384 + mov r5, dstq + call .main + sub eobd, 36 + jl .ret + add cq, 128*8 + lea dstq, [r5+16] + call .main + sub cq, 128*8-32 + lea dstq, [r5+strideq*8] + mov r5, dstq + call .main + sub eobd, 107 ; eob < 143 + jl .ret + add cq, 128*8 + lea dstq, [r5+16] + call .main + sub cq, 128*8-32 + lea dstq, [r5+strideq*8] + mov r5, dstq + call .main + sub eobd, 128 ; eob < 271 + jl .ret + add cq, 128*8 + lea dstq, [r5+16] + call .main + sub cq, 128*8-32 + lea dstq, [r5+strideq*8] + mov r5, dstq + call .main + sub eobd, 128 ; eob < 399 + jl .ret + add cq, 128*8 + lea dstq, [r5+16] + call .main +.ret: + RET +ALIGN function_align +.main: + mova m0, [cq+128*0] + packssdw m0, [cq+128*1] + mova m1, [cq+128*2] + packssdw m1, [cq+128*3] + mova m2, [cq+128*4] + packssdw m2, [cq+128*5] + mova m3, [cq+128*6] + packssdw m3, [cq+128*7] + REPX {pmulhrsw x, m8 }, m0, m1, m2, m3 + REPX {IDTX16 x, 4, 9, 10}, 0, 1, 2, 3 + REPX {pmulhrsw x, m11}, m0, m1, m2, m3 + REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 +.main2: + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m4 + punpcklwd m0, m4 + punpcklwd m4, m2, m1 + punpckhwd m2, m1 + punpckhqdq m1, m0, m4 + punpcklqdq m0, m4 + call m(iidentity_8x8_internal_10bpc).write_2x8x2 + punpcklqdq m0, m3, m2 + punpckhqdq m1, m3, m2 + jmp m(iidentity_8x8_internal_10bpc).write_2x8x2 + +cglobal inv_txfm_add_identity_identity_16x32_12bpc, 4, 7, 12, dst, stride, c, eob + vpbroadcastd m7, [pixel_12bpc_max] + jmp m(inv_txfm_add_identity_identity_16x32_10bpc).pass1 + +cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 8, 16, 32*40, dst, stride, c, eob +%undef cmp + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + lea r6, [rsp+32*4] + call .main + cmp eobd, 36 + jge .full + call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp] + lea r6, [pw_5+128] + mov r7, dstq + call m(idct_16x16_internal_8bpc).main + call .write_16x16 + mova m0, [r5+32*3] + mova m1, [r5+32*2] + mova m2, [r5+32*1] + mova m3, [r5+32*0] + mova m4, [r5-32*1] + mova m5, [r5-32*2] + mova m6, [r5-32*3] + mova m7, [r5-32*4] + call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp] + jmp .end +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] + mov [cq], eobd ; 0 + or r3d, 16 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 384 + sar r6d, 9 + jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2 +.full: + add cq, 32 + mova [r4+32*3], m0 + 
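
(Editorial sketch.) The .dconly path just above short-circuits an all-DC block: instead of running the full inverse transform, it scales the single DC coefficient with repeated multiply-by-181 steps (181/256 = 2896/4096, the usual 1/sqrt(2) fixed-point constant, applied once per transform pass plus once more for rectangular sizes, with the rounding terms folded into the add constants), then adds the result to every output pixel. The per-pixel clamp avoids compares entirely: dconly_10bpc is 0x7c00 = 0x7fff - 0x3ff, so a signed saturating add can only top out at the 10-bit maximum, and the following unsigned saturating subtract removes the bias while clamping at zero. A scalar C model of one lane of that paddsw/psubusw pair; the helper name is mine, not dav1d's:

    #include <stdint.h>

    /* Model of one lane of the .dconly store loop (paddsw, psubusw). */
    static uint16_t dc_add_clamp_10bpc(uint16_t px, int dc)
    {
        const int bias = 0x7c00;     /* dconly_10bpc = 0x7fff - 0x3ff */
        int v = px + bias + dc;      /* paddsw: saturates at 0x7fff,  */
        if (v > 0x7fff) v = 0x7fff;  /* so v - bias <= 0x3ff          */
        v -= bias;                   /* psubusw: saturates at zero    */
        return v < 0 ? 0 : (uint16_t)v;
    }
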
mova [r4+32*2], m1 + mova [r4+32*1], m2 + mova [r4+32*0], m3 + mova [r4-32*1], m4 + mova [r4-32*2], m5 + mova [r4-32*3], m6 + mova [r4-32*4], m7 + call .main + sub r4, 32*16 ; topleft 16x8 + call .transpose_16x16 + lea r6, [pw_5+128] + mov r7, dstq + call m(idct_16x16_internal_8bpc).main + call .write_16x16 + mova m0, [r5+32*3] + mova m1, [r5+32*2] + mova m2, [r5+32*1] + mova m3, [r5+32*0] + mova m4, [r5-32*1] + mova m5, [r5-32*2] + mova m6, [r5-32*3] + mova m7, [r5-32*4] + add r4, 32*8 ; bottomleft 16x8 + call .transpose_16x16 +.end: + lea dstq, [r7+32] + call m(idct_16x16_internal_8bpc).main + call .write_16x16 + RET +ALIGN function_align +.transpose_16x16: + punpckhdq m8, m3, m1 + punpckldq m3, m1 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckhdq m2, m7, m5 + punpckldq m7, m5 + punpckhdq m5, m4, m6 + punpckldq m4, m6 + punpckhqdq m6, m0, m4 + punpcklqdq m0, m4 + punpckhqdq m4, m1, m5 + punpcklqdq m1, m5 + punpckhqdq m5, m7, m3 + punpcklqdq m7, m3 + punpckhqdq m3, m2, m8 + punpcklqdq m2, m8 + vinserti128 m8, m0, xm7, 1 + vperm2i128 m12, m0, m7, 0x31 + vinserti128 m9, m6, xm5, 1 + vperm2i128 m13, m6, m5, 0x31 + vinserti128 m10, m1, xm2, 1 + vperm2i128 m14, m1, m2, 0x31 + vinserti128 m11, m4, xm3, 1 + vperm2i128 m15, m4, m3, 0x31 + mova m0, [r4+32*3] + mova m1, [r4+32*2] + mova m2, [r4+32*1] + mova m3, [r4+32*0] + mova m4, [r4-32*1] + mova m5, [r4-32*2] + mova m6, [r4-32*3] + mova m7, [r4-32*4] + mova [rsp+gprsize], m15 + jmp m(inv_txfm_add_dct_dct_8x32_10bpc).transpose +ALIGN function_align +.main: + vpbroadcastd m14, [pd_2896] + vpbroadcastd m11, [pd_2048] + pmulld m0, m14, [cq+64* 1] + pmulld m1, m14, [cq+64* 7] + pmulld m2, m14, [cq+64* 9] + pmulld m3, m14, [cq+64*15] + pmulld m4, m14, [cq+64*17] + pmulld m5, m14, [cq+64*23] + pmulld m6, m14, [cq+64*25] + pmulld m7, m14, [cq+64*31] + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_rect2 + pmulld m0, m14, [cq+64* 3] + pmulld m1, m14, [cq+64* 5] + pmulld m2, m14, [cq+64*11] + pmulld m3, m14, [cq+64*13] + pmulld m4, m14, [cq+64*19] + pmulld m5, m14, [cq+64*21] + pmulld m6, m14, [cq+64*27] + pmulld m7, m14, [cq+64*29] + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_rect2 + pmulld m0, m14, [cq+64* 2] + pmulld m1, m14, [cq+64* 6] + pmulld m2, m14, [cq+64*10] + pmulld m3, m14, [cq+64*14] + pmulld m4, m14, [cq+64*18] + pmulld m5, m14, [cq+64*22] + pmulld m6, m14, [cq+64*26] + pmulld m7, m14, [cq+64*30] + call m(idct_8x16_internal_10bpc).main_oddhalf_rect2 + pmulld m0, m14, [cq+64* 0] + pmulld m1, m14, [cq+64* 4] + pmulld m2, m14, [cq+64* 8] + pmulld m3, m14, [cq+64*12] + pmulld m4, m14, [cq+64*16] + pmulld m5, m14, [cq+64*20] + pmulld m6, m14, [cq+64*24] + pmulld m7, m14, [cq+64*28] + call m(idct_8x8_internal_10bpc).main_rect2 + call m(idct_8x16_internal_10bpc).main_evenhalf + pxor m8, m8 + mov r7d, 64*30 +.main_zero_loop: + mova [cq+r7-64*2], m8 + mova [cq+r7-64*1], m8 + mova [cq+r7+64*0], m8 + mova [cq+r7+64*1], m8 + sub r7d, 64*4 + jg .main_zero_loop +.main_end: + psrld m11, 11 ; pd_1 + IDCT32_END 0, 15, 8, 9, 10, 1 + IDCT32_END 1, 14, 8, 9, 10, 1 + punpckhwd m8, m0, m1 ; 16 17 + punpcklwd m0, m1 ; 0 1 + punpcklwd m1, m14, m15 ; 14 15 + punpckhwd m14, m15 ; 30 31 + mova [r5+32*3], m8 + mova [r5+32*2], m14 + IDCT32_END 2, 15, 8, 9, 10, 1 + IDCT32_END 3, 14, 8, 9, 10, 1 + punpckhwd m8, m2, m3 ; 18 19 + punpcklwd m2, m3 ; 2 3 + punpcklwd m3, m14, m15 ; 12 13 + punpckhwd m14, m15 ; 28 29 + mova [r5+32*1], m8 + mova [r5+32*0], m14 + IDCT32_END 4, 15, 8, 9, 10, 1 + IDCT32_END 5, 14, 8, 9, 10, 1 + punpckhwd m8, m4, m5 ; 
20 21 + punpcklwd m4, m5 ; 4 5 + punpcklwd m5, m14, m15 ; 10 11 + punpckhwd m14, m15 ; 26 27 + mova [r5-32*1], m8 + mova [r5-32*2], m14 + IDCT32_END 6, 15, 8, 9, 10, 1 + IDCT32_END 7, 14, 8, 9, 10, 1 + punpckhwd m8, m6, m7 ; 22 23 + punpcklwd m6, m7 ; 6 7 + punpcklwd m7, m14, m15 ; 8 9 + punpckhwd m14, m15 ; 24 25 + mova [r5-32*3], m8 + mova [r5-32*4], m14 + ret +ALIGN function_align +.write_16x16: + mova m1, [rsp+gprsize+32*1] + mova [rsp+gprsize+32*0], m8 + mova [rsp+gprsize+32*1], m9 + mova [rsp+gprsize+32*2], m12 + vpbroadcastd m12, [pw_2048] + vpbroadcastd m9, [pixel_10bpc_max] + lea r3, [strideq*3] + pxor m8, m8 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + pmulhrsw m2, m12 + pmulhrsw m3, m12 + call m(idct_16x8_internal_10bpc).write_16x4 + pmulhrsw m0, m12, m4 + pmulhrsw m1, m12, m5 + pmulhrsw m2, m12, m6 + pmulhrsw m3, m12, m7 + call m(idct_16x8_internal_10bpc).write_16x4 + pmulhrsw m0, m12, [rsp+gprsize+32*0] + pmulhrsw m1, m12, [rsp+gprsize+32*1] + pmulhrsw m2, m12, m10 + pmulhrsw m3, m12, m11 + call m(idct_16x8_internal_10bpc).write_16x4 + pmulhrsw m0, m12, [rsp+gprsize+32*2] + pmulhrsw m1, m12, m13 + pmulhrsw m2, m12, m14 + pmulhrsw m3, m12, m15 + jmp m(idct_16x8_internal_10bpc).write_16x4 + +cglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 11, dst, stride, c, eob + vpbroadcastd m7, [pixel_10bpc_max] +.pass1: + vpbroadcastd m8, [pw_2896x8] + vpbroadcastd m9, [pw_1697x16] + vpbroadcastd m10, [pw_4096] + lea r6, [strideq*5] + pxor m6, m6 + mov r5, dstq + call .main + sub eobd, 36 + jl .ret + add cq, 32 + lea dstq, [dstq+strideq*4] + call .main + add cq, 64*8-32 + lea dstq, [r5+16*1] + call .main + sub eobd, 107 ; eob < 143 + jl .ret + add cq, 32 + lea dstq, [dstq+strideq*4] + call .main + add cq, 64*8-32 + lea dstq, [r5+16*2] + call .main + sub eobd, 128 ; eob < 271 + jl .ret + add cq, 32 + lea dstq, [dstq+strideq*4] + call .main + add cq, 64*8-32 + lea dstq, [r5+16*3] + call .main + sub eobd, 128 ; eob < 399 + jl .ret + add cq, 32 + lea dstq, [dstq+strideq*4] + call .main +.ret: + RET +ALIGN function_align +.main: + mova m0, [cq+64*0] + packssdw m0, [cq+64*1] + mova m1, [cq+64*2] + packssdw m1, [cq+64*3] + mova m2, [cq+64*4] + packssdw m2, [cq+64*5] + mova m3, [cq+64*6] + packssdw m3, [cq+64*7] + REPX {pmulhrsw x, m8 }, m0, m1, m2, m3 + REPX {paddsw x, x }, m0, m1, m2, m3 + REPX {IDTX16 x, 4, 9, _ }, 0, 1, 2, 3 + REPX {pmulhrsw x, m10}, m0, m1, m2, m3 + REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 + jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2 + +cglobal inv_txfm_add_identity_identity_32x16_12bpc, 4, 7, 11, dst, stride, c, eob + vpbroadcastd m7, [pixel_12bpc_max] + jmp m(inv_txfm_add_identity_identity_32x16_10bpc).pass1 + +cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 8, 16, 32*83, dst, stride, c, eob +%undef cmp + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + lea r6, [rsp+32*7] + call .main + cmp eobd, 36 + jl .fast + call .main + cmp eobd, 136 + jl .fast + call .main + cmp eobd, 300 + jl .fast + call .main + jmp .pass2 +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] + mov [cq], eobd ; 0 + or r3d, 32 + jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly +.fast: + lea r4, [rsp+32*71] + pxor m0, m0 +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + cmp r6, r4 + jl .fast_loop +.pass2: + lea r3, [rsp+32*3] + mov r4, r6 + lea r5, [r6+32*8] + lea r6, [pw_5+128] + call .pass2_oddhalf + call .pass2_evenhalf + imul r2, strideq, 19 
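
(Editorial sketch.) Pass 1 of the 32x32 function above is staged on eob, the scan-order index of the last nonzero coefficient: each call to .main transforms one 8-coefficient-wide slice of the block (note the add cq, 32 per call), and the cmp eobd, 36/136/300 ladder branches to .fast, which zero-fills the remaining workspace, as soon as the leftover slices are known to contain only zeros. The same staging in plain C, with thresholds taken verbatim from the asm; the function name is hypothetical:

    /* How many of the four pass-1 slices of a 32x32 block can hold
     * nonzero coefficients, given the scan index of the last one. */
    static int pass1_slices_32x32(int eob)
    {
        if (eob <  36) return 1;
        if (eob < 136) return 2;
        if (eob < 300) return 3;
        return 4;
    }
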
+ lea r3, [strideq*3]
+ add r2, dstq
+ call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end
+ sub dstq, r3
+ lea r2, [r2+r3+32]
+ add dstq, 32
+ lea r3, [rsp+32*11]
+ call .pass2_oddhalf
+ call .pass2_evenhalf
+ lea r3, [strideq*3]
+ call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end
+ RET
+ALIGN function_align
+.main:
+ mova m0, [cq+128* 1]
+ mova m1, [cq+128* 7]
+ mova m2, [cq+128* 9]
+ mova m3, [cq+128*15]
+ mova m4, [cq+128*17]
+ mova m5, [cq+128*23]
+ mova m6, [cq+128*25]
+ mova m7, [cq+128*31]
+ vpbroadcastd m11, [pd_2048]
+ vpbroadcastd m14, [pd_2896]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1
+ mova m0, [cq+128* 3]
+ mova m1, [cq+128* 5]
+ mova m2, [cq+128*11]
+ mova m3, [cq+128*13]
+ mova m4, [cq+128*19]
+ mova m5, [cq+128*21]
+ mova m6, [cq+128*27]
+ mova m7, [cq+128*29]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2
+ mova m0, [cq+128* 2]
+ mova m1, [cq+128* 6]
+ mova m2, [cq+128*10]
+ mova m3, [cq+128*14]
+ mova m4, [cq+128*18]
+ mova m5, [cq+128*22]
+ mova m6, [cq+128*26]
+ mova m7, [cq+128*30]
+ call m(idct_8x16_internal_10bpc).main_oddhalf
+ mova m0, [cq+128* 0]
+ mova m1, [cq+128* 4]
+ mova m2, [cq+128* 8]
+ mova m3, [cq+128*12]
+ mova m4, [cq+128*16]
+ mova m5, [cq+128*20]
+ mova m6, [cq+128*24]
+ mova m7, [cq+128*28]
+ call m(idct_8x8_internal_10bpc).main
+ call m(idct_8x16_internal_10bpc).main_evenhalf
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end
+ pxor m15, m15
+ mov r7d, 128*29
+.main_zero_loop:
+ mova [cq+r7-128*1], m15
+ mova [cq+r7+128*0], m15
+ mova [cq+r7+128*1], m15
+ mova [cq+r7+128*2], m15
+ sub r7d, 128*4
+ jg .main_zero_loop
+ add cq, 32
+ mova [r4-32*4], m0
+ mova [r4-32*3], m1
+ mova [r4-32*2], m2
+ mova [r4-32*1], m3
+ mova [r4+32*0], m4
+ mova [r4+32*1], m5
+ mova [r4+32*2], m6
+ mova [r4+32*3], m7
+ mova m0, [r5+32*3]
+ mova m1, [r5+32*2]
+ mova m2, [r5+32*1]
+ mova m3, [r5+32*0]
+ mova m4, [r5-32*1]
+ mova m5, [r5-32*2]
+ mova m6, [r5-32*3]
+ mova m7, [r5-32*4]
+ call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+ mova [r5-32*4], m0
+ mova [r5-32*3], m1
+ mova [r5-32*2], m2
+ mova [r5-32*1], m3
+ mova [r5+32*0], m4
+ mova [r5+32*1], m5
+ mova [r5+32*2], m6
+ mova [r5+32*3], m7
+ ret
+ALIGN function_align
+.pass2_oddhalf:
+ mova m0, [r3+32* 1] ; 1
+ mova m1, [r3+32* 3] ; 3
+ mova m2, [r3+32* 5] ; 5
+ mova m3, [r3+32* 7] ; 7
+ mova m4, [r3+32*17] ; 9
+ mova m5, [r3+32*19] ; 11
+ mova m6, [r3+32*21] ; 13
+ mova m7, [r3+32*23] ; 15
+ mova m8, [r3+32*33] ; 17
+ mova m9, [r3+32*35] ; 19
+ mova m10, [r3+32*37] ; 21
+ mova m11, [r3+32*39] ; 23
+ mova m12, [r3+32*49] ; 25
+ mova m13, [r3+32*51] ; 27
+ mova m14, [r3+32*53] ; 29
+ mova m15, [r3+32*55] ; 31
+ jmp m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ALIGN function_align
+.pass2_evenhalf:
+ mova m0, [r3+32* 0] ; 0
+ mova m1, [r3+32* 2] ; 2
+ mova m2, [r3+32* 4] ; 4
+ mova m3, [r3+32* 6] ; 6
+ mova m4, [r3+32*16] ; 8
+ mova m5, [r3+32*18] ; 10
+ mova m6, [r3+32*20] ; 12
+ mova m7, [r3+32*22] ; 14
+ mova m8, [r3+32*32] ; 16
+ mova m9, [r3+32*34] ; 18
+ mova m10, [r3+32*36] ; 20
+ mova m11, [r3+32*38] ; 22
+ mova m12, [r3+32*48] ; 24
+ mova m13, [r3+32*50] ; 26
+ mova m14, [r3+32*52] ; 28
+ mova m15, [r3+32*54] ; 30
+ mova [rsp+gprsize], m15
+ jmp m(idct_16x16_internal_8bpc).main
+
+cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 8, 8, dst, stride, c, eob
+%undef cmp
+ vpbroadcastd m7, [pixel_10bpc_max]
+.pass1:
+ vpbroadcastd m5, [pw_8192]
+ pxor m6, m6
+ lea r6, [strideq*3]
+ lea r5, [strideq*5]
+ lea r4, [strideq+r6*2] ; strideq*7
+ call .main ; 0
+ cmp eobd, 36
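
(Editorial sketch.) The identity_identity paths above stay in the 16-bit domain end to end: coefficients are packed with packssdw, and the only scaling applied on this 32x32 path is a single pmulhrsw by pw_8192. Per lane, pmulhrsw computes dst = (a*b + 0x4000) >> 15, a Q15 multiply with round-to-nearest, so multiplying by 8192 (0.25 in Q15) is a rounded shift right by 2. A scalar model; the function name is hypothetical:

    #include <stdint.h>

    /* One lane of pmulhrsw: high half of a Q15 product, rounded. */
    static int16_t mulhrs(int16_t a, int16_t b)
    {
        return (int16_t)((a * (int32_t)b + 0x4000) >> 15);
    }
    /* mulhrs(x, 8192) == (x + 2) >> 2, the scaling used above. */
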
+ jl .ret + add cq, 128*8 ; 0 1 + mov r7, dstq ; 1 + add dstq, 16 + call .main + call .main2 + cmp eobd, 136 + jl .ret + add cq, 128*16-32 ; 0 1 2 + lea dstq, [r7+16*2] ; 1 2 + call .main ; 2 + call .main2 + call .main2 + cmp eobd, 300 + jl .ret + add cq, 128*24-64 ; 0 1 2 3 + add r7, 16*3 ; 1 2 3 + mov dstq, r7 ; 2 3 + call .main ; 3 + call .main2 + call .main2 + call .main2 + cmp eobd, 535 + jl .ret + add cq, 128*24-64 ; 0 1 2 3 + lea dstq, [r7+strideq*8] ; 1 2 3 4 + mov r7, dstq ; 2 3 4 + call .main ; 3 4 + call .main2 + call .main2 + cmp eobd, 755 + jl .ret + add cq, 128*16-32 ; 0 1 2 3 + lea dstq, [r7+strideq*8] ; 1 2 3 4 + call .main ; 2 3 4 5 + call .main2 ; 3 4 5 + cmp eobd, 911 + jl .ret + add cq, 128*8 ; 0 1 2 3 + add dstq, 16 ; 1 2 3 4 + call .main ; 2 3 4 5 +.ret: ; 3 4 5 6 + RET +ALIGN function_align +.main2: + sub cq, 128*8-32 + lea dstq, [dstq+strideq*8-16] +.main: + mova m0, [cq+128*0] + packssdw m0, [cq+128*1] + mova m1, [cq+128*2] + packssdw m1, [cq+128*3] + mova m2, [cq+128*4] + packssdw m2, [cq+128*5] + mova m3, [cq+128*6] + packssdw m3, [cq+128*7] + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + jmp m(inv_txfm_add_identity_identity_8x32_10bpc).main_zero + +cglobal inv_txfm_add_identity_identity_32x32_12bpc, 4, 8, 8, dst, stride, c, eob + vpbroadcastd m7, [pixel_12bpc_max] + jmp m(inv_txfm_add_identity_identity_32x32_10bpc).pass1 + +%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4]) +%if %1 & 1 + mova m%5, [r5-32*(51-%1)] ; idct16 out 0+n + mova m%4, [r4-32*(14+%1)] ; idct32 out31-n +%else + mova m%5, [r4-32*(45-%1)] + mova m%4, [r5-32*(20+%1)] +%endif + paddsw m%6, m%5, m%4 ; idct32 out 0+n + psubsw m%5, m%4 ; idct32 out31-n + paddsw m%4, m%5, m%3 ; out31-n + psubsw m%5, m%3 ; out32+n + paddsw m%3, m%6, m%2 ; out 0+n + psubsw m%6, m%2 ; out63-n + REPX {pmulhrsw x, m14}, m%5, m%6, m%4, m%3 +%if %1 & 1 + %define %%d0 r2 + %define %%d1 dstq +%else + %define %%d0 dstq + %define %%d1 r2 +%endif + paddw m%3, [%%d0+%7 ] + paddw m%4, [%%d1+%8 ] + paddw m%5, [%%d0+%9 ] + paddw m%6, [%%d1+%10] + pxor m%2, m%2 + REPX {pmaxsw x, m%2}, m%3, m%4, m%5, m%6 + vpbroadcastd m%2, [pixel_10bpc_max] + REPX {pminsw x, m%2}, m%3, m%4, m%5, m%6 + mova [%%d0+%7 ], m%3 + mova [%%d1+%8 ], m%4 + mova [%%d0+%9 ], m%5 + mova [%%d1+%10], m%6 +%endmacro + +cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 10, 16, 32*98, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vpbroadcastd m14, [pd_2896] + lea r6, [rsp+32*6] + call .main + sub eobd, 44 + jl .fast + call .main + sub eobd, 107 + jl .fast + call .main + sub eobd, 128 + jl .fast + call .main + jmp .pass2 +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] + mov [cq], eobd ; 0 + or r3d, 64 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3 +.fast: + lea r4, [rsp+32*38] + pxor m0, m0 +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + cmp r6, r4 + jl .fast_loop +.pass2: + lea r6, [pw_5+128] + mova m0, [rsp+32* 2] ; in0 + mova m1, [rsp+32* 6] ; in4 + mova m2, [rsp+32*10] ; in8 + mova m3, [rsp+32*14] ; in12 + mova m4, [rsp+32*18] ; in16 + mova m5, [rsp+32*22] ; in20 + mova m6, [rsp+32*26] ; in24 + mova m7, [rsp+32*30] ; in28 + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + call m(idct_16x16_internal_8bpc).main + mova m1, [rsp+32*1] + lea r4, [rsp+32*38] + mova [r4-32*4], m0 + mova 
[r4-32*3], m1 + mova [r4-32*2], m2 + mova [r4-32*1], m3 + mova [r4+32*0], m4 + mova [r4+32*1], m5 + mova [r4+32*2], m6 + mova [r4+32*3], m7 + add r4, 32*8 + mova [r4-32*4], m8 + mova [r4-32*3], m9 + mova [r4-32*2], m10 + mova [r4-32*1], m11 + mova [r4+32*0], m12 + mova [r4+32*1], m13 + mova [r4+32*2], m14 + mova [r4+32*3], m15 + mova m0, [rsp+32* 4] ; in2 + mova m1, [rsp+32* 8] ; in6 + mova m2, [rsp+32*12] ; in10 + mova m3, [rsp+32*16] ; in14 + mova m4, [rsp+32*20] ; in18 + mova m5, [rsp+32*24] ; in22 + mova m6, [rsp+32*28] ; in26 + mova m7, [rsp+32*32] ; in30 + lea r5, [r4+32*16] + add r4, 32*8 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + mova m0, [rsp+32* 3] ; in1 + mova m1, [rsp+32*33] ; in31 + mova m2, [rsp+32*19] ; in17 + mova m3, [rsp+32*17] ; in15 + mova m4, [rsp+32*11] ; in9 + mova m5, [rsp+32*25] ; in23 + mova m6, [rsp+32*27] ; in25 + mova m7, [rsp+32* 9] ; in7 + lea r6, [idct64_mul - 8] + add r4, 32*16 + add r5, 32*32 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + mova m0, [rsp+32* 7] ; in5 + mova m1, [rsp+32*29] ; in27 + mova m2, [rsp+32*23] ; in21 + mova m3, [rsp+32*13] ; in11 + mova m4, [rsp+32*15] ; in13 + mova m5, [rsp+32*21] ; in19 + mova m6, [rsp+32*31] ; in29 + mova m7, [rsp+32* 5] ; in3 + add r6, 8 + add r4, 32*8 + sub r5, 32*8 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + lea r8, [strideq*4] + lea r9, [strideq*5] + lea r3, [r9+strideq*1] ; stride*6 + lea r7, [r9+strideq*2] ; stride*7 + call .main_part2_pass2 + RET +ALIGN function_align +.main: + mova m0, [cq+128* 1] + mova m1, [cq+128* 3] + mova m2, [cq+128* 5] + mova m3, [cq+128* 7] + mova m4, [cq+128* 9] + mova m5, [cq+128*11] + mova m6, [cq+128*13] + mova m7, [cq+128*15] + call m(idct_8x16_internal_10bpc).main_oddhalf + mova m0, [cq+128* 0] + mova m1, [cq+128* 2] + mova m2, [cq+128* 4] + mova m3, [cq+128* 6] + mova m4, [cq+128* 8] + mova m5, [cq+128*10] + mova m6, [cq+128*12] + mova m7, [cq+128*14] + call m(idct_8x8_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_evenhalf + pxor m15, m15 + mov r7d, 128*13 +.main_zero_loop: + mova [cq+r7-128*1], m15 + mova [cq+r7+128*0], m15 + mova [cq+r7+128*1], m15 + mova [cq+r7+128*2], m15 + sub r7d, 128*4 + jg .main_zero_loop + add cq, 32 + psrld m15, m11, 10 ; pd_2 + mova m8, [r6-32*4] + mova m9, [r6+32*3] + REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7 + psubd m10, m0, m8 ; out15 + paddd m0, m8 ; out0 + mova m8, [r6-32*3] + psubd m15, m7, m9 ; out8 + paddd m7, m9 ; out7 + mova m9, [r6+32*2] + REPX {psrad x, 2}, m0, m15, m10, m7 + packssdw m0, m15 + packssdw m7, m10 + psubd m10, m1, m8 ; out14 + paddd m1, m8 ; out1 + mova m8, [r6-32*2] + psubd m15, m6, m9 ; out9 + paddd m6, m9 ; out6 + mova m9, [r6+32*1] + REPX {psrad x, 2}, m1, m15, m10, m6 + packssdw m1, m15 + packssdw m6, m10 + psubd m10, m2, m8 ; out13 + paddd m2, m8 ; out2 + mova m8, [r6-32*1] + psubd m15, m5, m9 ; out10 + paddd m5, m9 ; out5 + mova m9, [r6+32*0] + REPX {psrad x, 2}, m2, m15, m10, m5 + packssdw m2, m15 + packssdw m5, m10 + psubd m10, m3, m8 ; out12 + paddd m3, m8 ; out3 + psubd m15, m4, m9 ; out11 + paddd m4, m9 ; out4 + REPX {psrad x, 2}, m3, m15, m10, m4 + packssdw m3, m15 + packssdw m4, m10 + call m(idct_16x8_internal_10bpc).transpose3 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6-32*2], m2 + mova [r6-32*1], m3 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + add r6, 32*8 + ret +.main_part2_pass2: + vpbroadcastd m11, [pw_1567_3784] + vpbroadcastd m12, [pw_m3784_1567] + vpbroadcastd m13, [pw_2896_2896] + lea 
r6, [pw_5+128] + lea r2, [dstq+r7] +.main_part2_pass2_loop: + vpbroadcastd m14, [pw_m2896_2896] + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_internal + vpbroadcastd m14, [pw_2048] + IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*8, r7*8 + IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*8, r7*8 + IDCT64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8 + IDCT64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8 + add dstq, strideq + sub r2, strideq + cmp r4, r5 + jne .main_part2_pass2_loop + ret +ALIGN function_align +.main_part1_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 +.main_part1: ; idct64 steps 1-5 + ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a + vpbroadcastd m7, [r5+4*0] + vpbroadcastd m8, [r5+4*1] + vpbroadcastd m6, [r5+4*2] + vpbroadcastd m9, [r5+4*3] + vpbroadcastd m5, [r5+4*4] + vpbroadcastd m10, [r5+4*5] + vpbroadcastd m4, [r5+4*6] + vpbroadcastd m15, [r5+4*7] + pmulld m7, m0 ; t63a + pmulld m0, m8 ; t32a + pmulld m6, m1 ; t62a + pmulld m1, m9 ; t33a + pmulld m5, m2 ; t61a + pmulld m2, m10 ; t34a + pmulld m4, m3 ; t60a + pmulld m3, m15 ; t35a + vpbroadcastd m10, [r5+4*8] + vpbroadcastd m15, [r5+4*9] + REPX {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3 + REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4 + psubd m8, m0, m1 ; t33 + paddd m0, m1 ; t32 + psubd m1, m7, m6 ; t62 + paddd m7, m6 ; t63 + psubd m6, m3, m2 ; t34 + paddd m3, m2 ; t35 + psubd m2, m4, m5 ; t61 + paddd m4, m5 ; t60 + REPX {pmaxsd x, m12}, m8, m1, m6, m2 + REPX {pminsd x, m13}, m8, m1, m6, m2 + ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a + ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 2 ; t61a, t34a + REPX {pmaxsd x, m12}, m0, m3, m7, m4 + REPX {pminsd x, m13}, m0, m3, m7, m4 + vpbroadcastd m10, [r5+4*10] + vpbroadcastd m15, [r5+4*11] + psubd m5, m0, m3 ; t35a + paddd m0, m3 ; t32a + psubd m3, m7, m4 ; t60a + paddd m7, m4 ; t63a + psubd m4, m1, m6 ; t34 + paddd m1, m6 ; t33 + psubd m6, m8, m2 ; t61 + paddd m8, m2 ; t62 + REPX {pmaxsd x, m12}, m5, m3, m4, m6 + REPX {pminsd x, m13}, m5, m3, m4, m6 + ITX_MULSUB_2D 3, 5, 2, 9, _, 11, 10, 15 ; t35, t60 + ITX_MULSUB_2D 6, 4, 2, 9, _, 11, 10, 15 ; t34a, t61a + REPX {pmaxsd x, m12}, m0, m7, m1, m8 + REPX {pminsd x, m13}, m0, m7, m1, m8 + add r5, 4*12 + mova [r6-32*4], m0 + mova [r6+32*3], m7 + mova [r6-32*3], m1 + mova [r6+32*2], m8 + mova [r6-32*2], m6 + mova [r6+32*1], m4 + mova [r6-32*1], m3 + mova [r6+32*0], m5 + add r6, 32*8 + ret +.main_part2: ; idct64 steps 6-9 + lea r5, [r6+32*3] + sub r6, 32*4 + vpbroadcastd m10, [pd_1567] + vpbroadcastd m15, [pd_3784] +.main_part2_loop: + mova m0, [r6-32*32] ; t32a + mova m1, [r5-32*24] ; t39a + mova m2, [r5-32*32] ; t63a + mova m3, [r6-32*24] ; t56a + mova m4, [r6-32*16] ; t40a + mova m5, [r5-32* 8] ; t47a + mova m6, [r5-32*16] ; t55a + mova m7, [r6-32* 8] ; t48a + psubd m8, m0, m1 ; t39 + paddd m0, m1 ; t32 + psubd m1, m2, m3 ; t56 + paddd m2, m3 ; t63 + psubd m3, m5, m4 ; t40 + paddd m5, m4 ; t47 + psubd m4, m7, m6 ; t55 + paddd m7, m6 ; t48 + REPX {pmaxsd x, m12}, m8, m1, m3, m4 + REPX {pminsd x, m13}, m8, m1, m3, m4 + ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a + ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 2 ; t55a, t40a + REPX {pmaxsd x, m12}, m0, m2, m5, m7 + REPX {pminsd x, m13}, m0, m5, m2, m7 + psubd m6, m2, m7 ; t48a + paddd m2, m7 ; t63a + psubd m7, m0, m5 ; t47a + paddd m0, m5 ; 
t32a + psubd m5, m8, m4 ; t55 + paddd m8, m4 ; t56 + psubd m4, m1, m3 ; t40 + paddd m1, m3 ; t39 + REPX {pmaxsd x, m12}, m6, m7, m5, m4 + REPX {pminsd x, m13}, m6, m7, m5, m4 + REPX {pmulld x, m14}, m6, m7, m5, m4 + REPX {pmaxsd x, m12}, m2, m0, m8, m1 + REPX {pminsd x, m13}, m2, m0, m8, m1 + paddd m6, m11 + paddd m5, m11 + psubd m3, m6, m7 ; t47 + paddd m6, m7 ; t48 + psubd m7, m5, m4 ; t40a + paddd m5, m4 ; t55a + REPX {psrad x, 12}, m3, m6, m7, m5 + mova [r5-32* 8], m2 + mova [r6-32*32], m0 + mova [r6-32* 8], m8 + mova [r5-32*32], m1 + mova [r5-32*24], m3 + mova [r6-32*16], m6 + mova [r6-32*24], m7 + mova [r5-32*16], m5 + add r6, 32 + sub r5, 32 + cmp r6, r5 + jl .main_part2_loop + ret + +cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 11, 16, 32*134, dst, stride, c, eob +%undef cmp + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + lea r6, [rsp+32*6] + call .main + cmp eobd, 36 + jl .fast + call .main + cmp eobd, 136 + jl .fast + call .main + cmp eobd, 300 + jl .fast + call .main + jmp .pass2 +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] + mov [cq], eobd ; 0 + or r3d, 64 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 384 + sar r6d, 9 + jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2 +.fast: + lea r4, [rsp+32*70] + pxor m0, m0 +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + cmp r6, r4 + jl .fast_loop +.pass2: + lea r6, [pw_5 + 128] + mov r10, rsp + lea r8, [strideq*4] + lea r9, [strideq*5] + lea r3, [r9+strideq*1] ; stride*6 + lea r7, [r9+strideq*2] ; stride*7 +.pass2_loop: + mova m0, [r10+32* 2] ; in0 + mova m1, [r10+32* 6] ; in4 + mova m2, [r10+32*18] ; in8 + mova m3, [r10+32*22] ; in12 + mova m4, [r10+32*34] ; in16 + mova m5, [r10+32*38] ; in20 + mova m6, [r10+32*50] ; in24 + mova m7, [r10+32*54] ; in28 + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + call m(idct_16x16_internal_8bpc).main + mova m1, [rsp+32*1] + lea r4, [rsp+32*70] + mova [r4-32*4], m0 + mova [r4-32*3], m1 + mova [r4-32*2], m2 + mova [r4-32*1], m3 + mova [r4+32*0], m4 + mova [r4+32*1], m5 + mova [r4+32*2], m6 + mova [r4+32*3], m7 + add r4, 32*8 + mova [r4-32*4], m8 + mova [r4-32*3], m9 + mova [r4-32*2], m10 + mova [r4-32*1], m11 + mova [r4+32*0], m12 + mova [r4+32*1], m13 + mova [r4+32*2], m14 + mova [r4+32*3], m15 + mova m0, [r10+32* 4] ; in2 + mova m1, [r10+32* 8] ; in6 + mova m2, [r10+32*20] ; in10 + mova m3, [r10+32*24] ; in14 + mova m4, [r10+32*36] ; in18 + mova m5, [r10+32*40] ; in22 + mova m6, [r10+32*52] ; in26 + mova m7, [r10+32*56] ; in30 + lea r5, [r4+32*16] + add r4, 32*8 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + mova m0, [r10+32* 3] ; in1 + mova m1, [r10+32*57] ; in31 + mova m2, [r10+32*35] ; in17 + mova m3, [r10+32*25] ; in15 + mova m4, [r10+32*19] ; in9 + mova m5, [r10+32*41] ; in23 + mova m6, [r10+32*51] ; in25 + mova m7, [r10+32* 9] ; in7 + lea r6, [idct64_mul - 8] + add r4, 32*16 + add r5, 32*32 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + mova m0, [r10+32* 7] ; in5 + mova m1, [r10+32*53] ; in27 + mova m2, [r10+32*39] ; in21 + mova m3, [r10+32*21] ; in11 + mova m4, [r10+32*23] ; in13 + mova m5, [r10+32*37] ; in19 + mova m6, [r10+32*55] ; in29 + mova m7, [r10+32* 5] ; in3 + add r6, 8 + add r4, 32*8 + sub r5, 32*8 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2_pass2 + add r10, 32*8 + sub r4, 32*98 ; rsp+32*16 + sub dstq, r8 + add 
dstq, 32 + cmp r10, r4 + jl .pass2_loop + RET +ALIGN function_align +.main: + vpbroadcastd m14, [pd_2896] + vpbroadcastd m11, [pd_2048] + pmulld m0, m14, [cq+128* 1] + pmulld m1, m14, [cq+128* 7] + pmulld m2, m14, [cq+128* 9] + pmulld m3, m14, [cq+128*15] + pmulld m4, m14, [cq+128*17] + pmulld m5, m14, [cq+128*23] + pmulld m6, m14, [cq+128*25] + pmulld m7, m14, [cq+128*31] + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_rect2 + pmulld m0, m14, [cq+128* 3] + pmulld m1, m14, [cq+128* 5] + pmulld m2, m14, [cq+128*11] + pmulld m3, m14, [cq+128*13] + pmulld m4, m14, [cq+128*19] + pmulld m5, m14, [cq+128*21] + pmulld m6, m14, [cq+128*27] + pmulld m7, m14, [cq+128*29] + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_rect2 + pmulld m0, m14, [cq+128* 2] + pmulld m1, m14, [cq+128* 6] + pmulld m2, m14, [cq+128*10] + pmulld m3, m14, [cq+128*14] + pmulld m4, m14, [cq+128*18] + pmulld m5, m14, [cq+128*22] + pmulld m6, m14, [cq+128*26] + pmulld m7, m14, [cq+128*30] + call m(idct_8x16_internal_10bpc).main_oddhalf_rect2 + pmulld m0, m14, [cq+128* 0] + pmulld m1, m14, [cq+128* 4] + pmulld m2, m14, [cq+128* 8] + pmulld m3, m14, [cq+128*12] + pmulld m4, m14, [cq+128*16] + pmulld m5, m14, [cq+128*20] + pmulld m6, m14, [cq+128*24] + pmulld m7, m14, [cq+128*28] + pxor m15, m15 + mov r7d, 128*29 +.main_zero_loop: + mova [cq+r7-128*1], m15 + mova [cq+r7+128*0], m15 + mova [cq+r7+128*1], m15 + mova [cq+r7+128*2], m15 + sub r7d, 128*4 + jg .main_zero_loop + add cq, 32 + call m(idct_8x8_internal_10bpc).main_rect2 + call m(idct_8x16_internal_10bpc).main_evenhalf + call m(inv_txfm_add_dct_dct_32x16_10bpc).main_end + call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose + mova [r4-32*4], m0 + mova [r4-32*3], m1 + mova [r4-32*2], m2 + mova [r4-32*1], m3 + mova [r4+32*0], m4 + mova [r4+32*1], m5 + mova [r4+32*2], m6 + mova [r4+32*3], m7 + mova m0, [r5+32*3] + mova m1, [r5+32*2] + mova m2, [r5+32*1] + mova m3, [r5+32*0] + mova m4, [r5-32*1] + mova m5, [r5-32*2] + mova m6, [r5-32*3] + mova m7, [r5-32*4] + call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose + mova [r5-32*4], m0 + mova [r5-32*3], m1 + mova [r5-32*2], m2 + mova [r5-32*1], m3 + mova [r5+32*0], m4 + mova [r5+32*1], m5 + mova [r5+32*2], m6 + mova [r5+32*3], m7 + ret + +cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jnz .normal + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 16 +.dconly: + add r6d, 640 + sar r6d, 10 +.dconly2: + vpbroadcastd m5, [dconly_10bpc] + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 + movd xm0, r6d + paddsw xm0, xm5 + vpbroadcastw m0, xm0 +.dconly_loop: + paddsw m1, m0, [dstq+32*0] + paddsw m2, m0, [dstq+32*1] + paddsw m3, m0, [dstq+32*2] + paddsw m4, m0, [dstq+32*3] + REPX {psubusw x, m5}, m1, m2, m3, m4 + mova [dstq+32*0], m1 + mova [dstq+32*1], m2 + mova [dstq+32*2], m3 + mova [dstq+32*3], m4 + add dstq, strideq + dec r3d + jg .dconly_loop + RET +.normal: + PROLOGUE 0, 8, 16, 32*96, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vpbroadcastd m14, [pd_2896] + lea r6, [rsp+32*4] + call .main + call .shift_transpose + cmp eobd, 36 + jl .fast + call .main + call .shift_transpose + jmp .pass2 +.fast: + pxor m0, m0 + mov r3d, 4 +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + dec r3d + jg .fast_loop +.pass2: + lea r7, [r6-32*64] + lea r4, [r6-32*32] + lea r6, [pw_5+128] + mov r5, dstq +.pass2_loop: + mova m0, [r7-32*4] + mova m1, [r7-32*3] + mova m2, 
[r7-32*2] + mova m3, [r7-32*1] + mova m4, [r7+32*0] + mova m5, [r7+32*1] + mova m6, [r7+32*2] + mova m7, [r7+32*3] + add r7, 32*32 + mova m8, [r7-32*4] + mova m9, [r7-32*3] + mova m10, [r7-32*2] + mova m11, [r7-32*1] + mova m12, [r7+32*0] + mova m13, [r7+32*1] + mova m14, [r7+32*2] + mova m15, [r7+32*3] + sub r7, 32*24 + mova [rsp], m15 + call m(idct_16x16_internal_8bpc).main + mova m1, [rsp+32*1] + call m(inv_txfm_add_dct_dct_32x16_10bpc).write_16x16 + add r5, 32 + mov dstq, r5 + cmp r7, r4 + jl .pass2_loop + RET +ALIGN function_align +.main: + lea r5, [idct64_mul_16bpc] + mova m0, [cq+64* 1] + mova m1, [cq+64*31] + mova m2, [cq+64*17] + mova m3, [cq+64*15] + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 + mova m0, [cq+64* 7] + mova m1, [cq+64*25] + mova m2, [cq+64*23] + mova m3, [cq+64* 9] + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 + mova m0, [cq+64* 5] + mova m1, [cq+64*27] + mova m2, [cq+64*21] + mova m3, [cq+64*11] + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 + mova m0, [cq+64* 3] + mova m1, [cq+64*29] + mova m2, [cq+64*19] + mova m3, [cq+64*13] + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2 + mova m0, [cq+64* 2] + mova m1, [cq+64*14] + mova m2, [cq+64*18] + mova m3, [cq+64*30] + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast + mova m0, [cq+64* 6] + mova m1, [cq+64*10] + mova m2, [cq+64*22] + mova m3, [cq+64*26] + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast + mova m0, [cq+64* 4] + mova m1, [cq+64*12] + mova m2, [cq+64*20] + mova m3, [cq+64*28] + call m(idct_8x16_internal_10bpc).main_oddhalf_fast + mova m0, [cq+64* 0] + mova m1, [cq+64* 8] + mova m2, [cq+64*16] + mova m3, [cq+64*24] + pxor m15, m15 + mov r7d, 64*30 +.main_zero_loop: + mova [cq+r7-64*2], m15 + mova [cq+r7-64*1], m15 + mova [cq+r7+64*0], m15 + mova [cq+r7+64*1], m15 + sub r7d, 64*4 + jg .main_zero_loop +.main_end: + psrld m15, m11, 10 ; pd_2 +.main_end2: + add cq, 32 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_10bpc).main + add r6, 32*8 + call m(idct_8x16_internal_10bpc).main_evenhalf + mova [r6+32*2], m1 + mova [r6+32*1], m2 + mova [r6+32*0], m3 + mova [r6-32*1], m4 + mova [r6-32*2], m5 + mova [r6-32*3], m6 + mova [r6-32*4], m7 + jmp .main_end_loop_start +.main_end_loop: + mova m0, [r6+32* 3] ; idct8 0 + n +.main_end_loop_start: + mova m1, [r5+32* 4] ; idct16 15 - n + mova m2, [r5-32*12] ; idct32 16 + n + mova m3, [r6-32*13] ; idct32 31 - n + mova m4, [r6-32*29] ; idct64 63 - n + mova m5, [r5-32*28] ; idct64 48 + n + mova m6, [r6-32*45] ; idct64 47 - n + mova m7, [r5-32*44] ; idct64 32 + n + paddd m8, m0, m1 ; idct16 out0 + n + psubd m0, m1 ; idct16 out15 - n + REPX {pmaxsd x, m12}, m8, m0 + REPX {pminsd x, m13}, m8, m0 + paddd m1, m8, m3 ; idct32 out0 + n + psubd m8, m3 ; idct32 out31 - n + paddd m3, m0, m2 ; idct32 out15 - n + psubd m0, m2 ; idct32 out16 + n + REPX {pmaxsd x, m12}, m1, m8, m3, m0 + REPX {pminsd x, m13}, m1, m3, m8, m0 + REPX {paddd x, m15}, m1, m3, m0, m8 + paddd m2, m1, m4 ; idct64 out0 + n (unshifted) + psubd m1, m4 ; idct64 out63 - n (unshifted) + paddd m4, m3, m5 ; idct64 out15 - n (unshifted) + psubd m3, m5 ; idct64 out48 + n (unshifted) + paddd m5, m0, m6 ; idct64 out16 + n (unshifted) + psubd m0, m6 ; idct64 out47 - n (unshifted) + paddd m6, m8, m7 ; idct64 out31 - n (unshifted) + psubd m8, m7 ; idct64 out32 + n (unshifted) + mova [r5-32*44], m2 + mova [r6+32* 3], m1 + mova [r6-32*45], m4 + mova [r5+32* 4], m3 + mova [r5-32*28], m5 + mova 
[r6-32*13], m0 + mova [r6-32*29], m6 + mova [r5-32*12], m8 + add r5, 32 + sub r6, 32 + cmp r5, r6 + jl .main_end_loop + ret +.shift_transpose: +%macro IDCT64_SHIFT_TRANSPOSE 1 ; shift + sub r6, 32*48 + mov r5, r6 +%%loop: + mova m0, [r6-32* 4] + mova m4, [r6+32* 4] + mova m1, [r6-32* 3] + mova m5, [r6+32* 5] + mova m2, [r6-32* 2] + mova m6, [r6+32* 6] + mova m3, [r6-32* 1] + mova m7, [r6+32* 7] + REPX {psrad x, %1}, m0, m4, m1, m5, m2, m6, m3, m7 + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + mova m4, [r6+32* 0] + mova m6, [r6+32* 8] + mova m5, [r6+32* 1] + mova m7, [r6+32* 9] + REPX {psrad x, %1}, m4, m6, m5, m7 + packssdw m4, m6 + packssdw m5, m7 + mova m6, [r6+32* 2] + mova m8, [r6+32*10] + mova m7, [r6+32* 3] + mova m9, [r6+32*11] + REPX {psrad x, %1}, m6, m8, m7, m9 + packssdw m6, m8 + packssdw m7, m9 + call m(idct_16x8_internal_10bpc).transpose3 + mova [r5-32*4], m0 + mova [r5-32*3], m1 + mova [r5-32*2], m2 + mova [r5-32*1], m3 + mova [r5+32*0], m4 + mova [r5+32*1], m5 + mova [r5+32*2], m6 + mova [r5+32*3], m7 + add r6, 32*16 + add r5, 32*8 + cmp r5, r4 + jl %%loop + mov r6, r4 +%endmacro + IDCT64_SHIFT_TRANSPOSE 2 + ret + +cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 8, 16, 32*163, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vpbroadcastd m14, [pd_2896] + lea r6, [rsp+32*7] + call .main + cmp eobd, 36 + jl .fast + call .main + cmp eobd, 136 + jl .fast + call .main + cmp eobd, 300 + jl .fast + call .main + jmp .pass2 +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 32 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 384 + sar r6d, 9 + jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2 +.fast: + pxor m0, m0 + lea r4, [rsp+32*135] +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + cmp r6, r4 + jl .fast_loop +.pass2: + lea r7, [r6-32*32] + lea r5, [r6+32*8] + lea r6, [pw_5+128] + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq +.pass2_loop: + mova m0, [r7-32*99] + mova m1, [r7-32*97] + mova m2, [r7-32*95] + mova m3, [r7-32*93] + mova m4, [r7-32*67] + mova m5, [r7-32*65] + mova m6, [r7-32*63] + mova m7, [r7-32*61] + mova m8, [r7-32*35] + mova m9, [r7-32*33] + mova m10, [r7-32*31] + mova m11, [r7-32*29] + mova m12, [r7-32* 3] + mova m13, [r7-32* 1] + mova m14, [r7+32* 1] + mova m15, [r7+32* 3] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf + mova m0, [r7-32*100] + mova m1, [r7-32*98] + mova m2, [r7-32*96] + mova m3, [r7-32*94] + mova m4, [r7-32*68] + mova m5, [r7-32*66] + mova m6, [r7-32*64] + mova m7, [r7-32*62] + mova m8, [r7-32*36] + mova m9, [r7-32*34] + mova m10, [r7-32*32] + mova m11, [r7-32*30] + mova m12, [r7-32* 4] + mova m13, [r7-32* 2] + mova m14, [r7+32* 0] + mova m15, [r7+32* 2] + add r7, 32*8 + mova [rsp], m15 + call m(idct_16x16_internal_8bpc).main + call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end + sub dstq, r3 + lea r2, [r2+r3+32] + add dstq, 32 + cmp r7, r4 + jl .pass2_loop + RET +ALIGN function_align +.main: + lea r5, [idct64_mul_16bpc] + pmulld m0, m14, [cq+128* 1] + pmulld m1, m14, [cq+128*31] + pmulld m2, m14, [cq+128*17] + pmulld m3, m14, [cq+128*15] + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2 + pmulld m0, m14, [cq+128* 7] + pmulld m1, m14, [cq+128*25] + pmulld m2, m14, [cq+128*23] + pmulld m3, m14, [cq+128* 9] + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2 + pmulld m0, m14, 
[cq+128* 5] + pmulld m1, m14, [cq+128*27] + pmulld m2, m14, [cq+128*21] + pmulld m3, m14, [cq+128*11] + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2 + pmulld m0, m14, [cq+128* 3] + pmulld m1, m14, [cq+128*29] + pmulld m2, m14, [cq+128*19] + pmulld m3, m14, [cq+128*13] + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2 + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2 + pmulld m0, m14, [cq+128* 2] + pmulld m1, m14, [cq+128*14] + pmulld m2, m14, [cq+128*18] + pmulld m3, m14, [cq+128*30] + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast_rect2 + pmulld m0, m14, [cq+128* 6] + pmulld m1, m14, [cq+128*10] + pmulld m2, m14, [cq+128*22] + pmulld m3, m14, [cq+128*26] + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast_rect2 + pmulld m0, m14, [cq+128* 4] + pmulld m1, m14, [cq+128*12] + pmulld m2, m14, [cq+128*20] + pmulld m3, m14, [cq+128*28] + call m(idct_8x16_internal_10bpc).main_oddhalf_fast_rect2 + pmulld m0, m14, [cq+128* 0] + pmulld m1, m14, [cq+128* 8] + pmulld m2, m14, [cq+128*16] + pmulld m3, m14, [cq+128*24] + pxor m15, m15 + mov r7d, 128*29 +.main_zero_loop: + mova [cq+r7-128*1], m15 + mova [cq+r7+128*0], m15 + mova [cq+r7+128*1], m15 + mova [cq+r7+128*2], m15 + sub r7d, 128*4 + jg .main_zero_loop + psrld m15, m11, 11 ; pd_1 + REPX {paddd x, m11}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end2 + IDCT64_SHIFT_TRANSPOSE 1 + ret + +cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 11, 16, 32*195, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vpbroadcastd m14, [pd_2896] + lea r6, [rsp+32*7] + call .main + cmp eobd, 36 + jl .fast + call .main + cmp eobd, 136 + jl .fast + call .main + cmp eobd, 300 + jl .fast + call .main + jmp .pass2 +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 64 + jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly +.fast: + pxor m0, m0 + lea r4, [rsp+32*135] +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + cmp r6, r4 + jl .fast_loop +.pass2: + lea r10, [r6-32*32] + lea r6, [pw_5+128] + lea r8, [strideq*4] + lea r9, [strideq*5] + lea r3, [r9+strideq*1] ; stride*6 + lea r7, [r9+strideq*2] ; stride*7 +.pass2_loop: + mova m0, [r10-32*100] ; in0 + mova m1, [r10-32*96] ; in4 + mova m2, [r10-32*68] ; in8 + mova m3, [r10-32*64] ; in12 + mova m4, [r10-32*36] ; in16 + mova m5, [r10-32*32] ; in20 + mova m6, [r10-32* 4] ; in24 + mova m7, [r10+32* 0] ; in28 + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + call m(idct_16x16_internal_8bpc).main + mova m1, [rsp+32*1] + mova [r4-32*4], m0 + mova [r4-32*3], m1 + mova [r4-32*2], m2 + mova [r4-32*1], m3 + mova [r4+32*0], m4 + mova [r4+32*1], m5 + mova [r4+32*2], m6 + mova [r4+32*3], m7 + add r4, 32*8 + mova [r4-32*4], m8 + mova [r4-32*3], m9 + mova [r4-32*2], m10 + mova [r4-32*1], m11 + mova [r4+32*0], m12 + mova [r4+32*1], m13 + mova [r4+32*2], m14 + mova [r4+32*3], m15 + mova m0, [r10-32*98] ; in2 + mova m1, [r10-32*94] ; in6 + mova m2, [r10-32*66] ; in10 + mova m3, [r10-32*62] ; in14 + mova m4, [r10-32*34] ; in18 + mova m5, [r10-32*30] ; in22 + mova m6, [r10-32* 2] ; in26 + mova m7, [r10+32* 2] ; in30 + lea r5, [r4+32*16] + add r4, 32*8 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + mova m0, [r10-32*99] ; in1 + mova m1, [r10+32* 3] ; in31 + mova m2, [r10-32*35] ; in17 + mova m3, 
[r10-32*61] ; in15 + mova m4, [r10-32*67] ; in9 + mova m5, [r10-32*29] ; in23 + mova m6, [r10-32* 3] ; in25 + mova m7, [r10-32*93] ; in7 + lea r6, [idct64_mul - 8] + add r4, 32*16 + add r5, 32*32 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + mova m0, [r10-32*95] ; in5 + mova m1, [r10-32* 1] ; in27 + mova m2, [r10-32*31] ; in21 + mova m3, [r10-32*65] ; in11 + mova m4, [r10-32*63] ; in13 + mova m5, [r10-32*33] ; in19 + mova m6, [r10+32* 1] ; in29 + mova m7, [r10-32*97] ; in3 + add r6, 8 + add r4, 32*8 + sub r5, 32*8 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2_pass2 + add r10, 32*8 + sub dstq, r8 + sub r4, 32*44 + add dstq, 32 + cmp r10, r4 + jl .pass2_loop + RET +ALIGN function_align +.main: + lea r5, [idct64_mul_16bpc] + mova m0, [cq+128* 1] + mova m1, [cq+128*31] + mova m2, [cq+128*17] + mova m3, [cq+128*15] + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 + mova m0, [cq+128* 7] + mova m1, [cq+128*25] + mova m2, [cq+128*23] + mova m3, [cq+128* 9] + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 + mova m0, [cq+128* 5] + mova m1, [cq+128*27] + mova m2, [cq+128*21] + mova m3, [cq+128*11] + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 + mova m0, [cq+128* 3] + mova m1, [cq+128*29] + mova m2, [cq+128*19] + mova m3, [cq+128*13] + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2 + mova m0, [cq+128* 2] + mova m1, [cq+128*14] + mova m2, [cq+128*18] + mova m3, [cq+128*30] + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast + mova m0, [cq+128* 6] + mova m1, [cq+128*10] + mova m2, [cq+128*22] + mova m3, [cq+128*26] + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast + mova m0, [cq+128* 4] + mova m1, [cq+128*12] + mova m2, [cq+128*20] + mova m3, [cq+128*28] + call m(idct_8x16_internal_10bpc).main_oddhalf_fast + mova m0, [cq+128* 0] + mova m1, [cq+128* 8] + mova m2, [cq+128*16] + mova m3, [cq+128*24] + pxor m15, m15 + mov r7d, 128*29 +.main_zero_loop: + mova [cq+r7-128*1], m15 + mova [cq+r7+128*0], m15 + mova [cq+r7+128*1], m15 + mova [cq+r7+128*2], m15 + sub r7d, 128*4 + jg .main_zero_loop + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end + jmp m(inv_txfm_add_dct_dct_64x16_10bpc).shift_transpose + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/itx16_avx512.asm b/third_party/dav1d/src/x86/itx16_avx512.asm new file mode 100644 index 0000000000..d973655462 --- /dev/null +++ b/third_party/dav1d/src/x86/itx16_avx512.asm @@ -0,0 +1,4133 @@ +; Copyright © 2022-2023, VideoLAN and dav1d authors +; Copyright © 2022-2023, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +idct8x8p: db 0, 1, 4, 5, 2, 3, 6, 7, 16, 17, 20, 21, 18, 19, 22, 23 + db 8, 9, 12, 13, 10, 11, 14, 15, 24, 25, 28, 29, 26, 27, 30, 31 + db 32, 33, 36, 37, 34, 35, 38, 39, 48, 49, 52, 53, 50, 51, 54, 55 + db 40, 41, 44, 45, 42, 43, 46, 47, 56, 57, 60, 61, 58, 59, 62, 63 +idtx8x8p: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39 + db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47 + db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55 + db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63 +idct8x16p: db 54, 55, 2, 3, 22, 23, 34, 35, 38, 39, 18, 19, 6, 7, 50, 51 + db 62, 63, 10, 11, 30, 31, 42, 43, 46, 47, 26, 27, 14, 15, 58, 59 + db 52, 53, 4, 5, 20, 21, 36, 37, 32, 33, 0, 1, 48, 49, 16, 17 + db 60, 61, 12, 13, 28, 29, 44, 45, 40, 41, 8, 9, 56, 57, 24, 25 +iadst8x16p: db 0, 1, 54, 55, 48, 49, 6, 7, 16, 17, 38, 39, 32, 33, 22, 23 + db 8, 9, 62, 63, 56, 57, 14, 15, 24, 25, 46, 47, 40, 41, 30, 31 + db 4, 5, 50, 51, 52, 53, 2, 3, 20, 21, 34, 35, 36, 37, 18, 19 + db 12, 13, 58, 59, 60, 61, 10, 11, 28, 29, 42, 43, 44, 45, 26, 27 +permA: db 0, 1, 0, 8, 4, 5, 1, 9, 8, 9, 4, 12, 12, 13, 5, 13 + db 16, 17, 16, 24, 20, 21, 17, 25, 24, 25, 20, 28, 28, 29, 21, 29 + db 2, 3, 2, 10, 6, 7, 3, 11, 10, 11, 6, 14, 14, 15, 7, 15 + db 18, 19, 18, 26, 22, 23, 19, 27, 26, 27, 22, 30, 30, 31, 23, 31 +permB: db 4, 2, 1, 8, 0, 0, 1, 0, 12, 3, 3, 10, 8, 1, 3, 2 + db 5, 10, 5, 12, 1, 8, 5, 4, 13, 11, 7, 14, 9, 9, 7, 6 + db 6, 6, 13, 4, 2, 4, 4, 5, 14, 7, 15, 6, 10, 5, 6, 7 + db 7, 14, 9, 0, 3, 12, 0, 1, 15, 15, 11, 2, 11, 13, 2, 3 +permC: db 0, 9, 0, 0, 0, 1, 4, 4, 2, 11, 2, 2, 2, 3, 6, 6 + db 1, 8, 1, 8, 4, 5, 5, 12, 3, 10, 3, 10, 6, 7, 7, 14 + db 9, 1, 8, 1, 1, 0, 12, 5, 11, 3, 10, 3, 3, 2, 14, 7 + db 8, 0, 9, 9, 5, 4, 13, 13, 10, 2, 11, 11, 7, 6, 15, 15 +idct8x32p: db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53 + db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61 + db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55 + db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63 +idct32x8p: db 2, 18, 0, 16, 3, 19, 1, 17, 10, 26, 8, 24, 11, 27, 9, 25 + db 34, 50, 32, 48, 35, 51, 33, 49, 42, 58, 40, 56, 43, 59, 41, 57 + db 6, 22, 4, 20, 7, 23, 5, 21, 14, 30, 12, 28, 15, 31, 13, 29 + db 38, 54, 36, 52, 39, 55, 37, 53, 46, 62, 44, 60, 47, 63, 45, 61 +idtx32x8p: db 0, 8, 16, 24, 4, 12, 20, 28, 2, 10, 18, 26, 6, 14, 22, 30 + db 32, 40, 48, 56, 36, 44, 52, 60, 34, 42, 50, 58, 38, 46, 54, 62 + db 1, 9, 17, 25, 5, 13, 21, 29, 3, 11, 19, 27, 7, 15, 23, 31 + db 33, 41, 49, 57, 37, 45, 53, 61, 35, 43, 51, 59, 39, 47, 55, 63 + +pw_2048_m2048: times 16 dw 2048 +pw_m2048_2048: times 16 dw -2048 +pw_2048: times 16 dw 2048 + +; flags: 0 = ++, 1 = +-, 2 = -+, 3 = ++- +%macro COEF_PAIR 2-3 0 ; a, b, flags +%if %3 == 1 +pd_%1_m%2: dd %1, %1, -%2, -%2 +%define pd_%1 (pd_%1_m%2 + 4*0) +%define pd_m%2 (pd_%1_m%2 + 4*2) +%elif %3 
== 2 +pd_m%1_%2: dd -%1, -%1, %2, %2 +%define pd_m%1 (pd_m%1_%2 + 4*0) +%define pd_%2 (pd_m%1_%2 + 4*2) +%else +pd_%1_%2: dd %1, %1, %2, %2 +%define pd_%1 (pd_%1_%2 + 4*0) +%define pd_%2 (pd_%1_%2 + 4*2) +%if %3 == 3 +%define pd_%2_m%2 pd_%2 +dd -%2, -%2 +%endif +%endif +%endmacro + +COEF_PAIR 201, 995 +COEF_PAIR 401, 1189, 1 +COEF_PAIR 401, 1931 +COEF_PAIR 401, 3920 +COEF_PAIR 799, 2276, 1 +COEF_PAIR 799, 3406 +COEF_PAIR 799, 4017 +COEF_PAIR 1380, 601 +COEF_PAIR 1751, 2440 +COEF_PAIR 2598, 1189 +COEF_PAIR 2598, 1931, 2 +COEF_PAIR 2598, 3612 +COEF_PAIR 2751, 2106 +COEF_PAIR 2896, 1567, 3 +COEF_PAIR 2896, 3784, 3 +COEF_PAIR 3035, 3513 +COEF_PAIR 3166, 1931 +COEF_PAIR 3166, 3612 +COEF_PAIR 3166, 3920 +COEF_PAIR 3703, 3290 +COEF_PAIR 3857, 4052 +COEF_PAIR 4017, 2276 +COEF_PAIR 4017, 3406 +COEF_PAIR 4076, 1189 +COEF_PAIR 4076, 3612 +COEF_PAIR 4076, 3920 +COEF_PAIR 4091, 3973 + +pb_32: times 4 db 32 +pw_5: times 2 dw 5 +pw_4096: times 2 dw 4096 +pw_8192: times 2 dw 8192 +pw_1697x16: times 2 dw 1697*16 +pw_2896x8: times 2 dw 2896*8 +pixel_10bpc_max: times 2 dw 0x03ff +dconly_10bpc: times 2 dw 0x7c00 +clip_18b_min: dd -0x20000 +clip_18b_max: dd 0x1ffff +pd_1: dd 1 +pd_2: dd 2 +pd_1448: dd 1448 +pd_2048: dd 2048 +pd_3071: dd 3071 ; 1024 + 2048 - 1 +pd_3072: dd 3072 ; 1024 + 2048 +pd_5119: dd 5119 ; 1024 + 4096 - 1 +pd_5120: dd 5120 ; 1024 + 4096 +pd_5793: dd 5793 + +cextern dup16_perm +cextern int8_permA +cextern idct_8x8_internal_8bpc_avx512icl.main +cextern iadst_8x8_internal_8bpc_avx512icl.main_pass2 +cextern idct_8x16_internal_8bpc_avx512icl.main +cextern idct_8x16_internal_8bpc_avx512icl.main2 +cextern idct_8x16_internal_8bpc_avx512icl.main_fast +cextern idct_8x16_internal_8bpc_avx512icl.main_fast2 +cextern iadst_8x16_internal_8bpc_avx512icl.main2 +cextern idct_16x8_internal_8bpc_avx512icl.main +cextern iadst_16x8_internal_8bpc_avx512icl.main_pass2 +cextern idct_16x16_internal_8bpc_avx512icl.main +cextern idct_16x16_internal_8bpc_avx512icl.main2 +cextern idct_16x16_internal_8bpc_avx512icl.main_fast +cextern idct_16x16_internal_8bpc_avx512icl.main_fast2 +cextern iadst_16x16_internal_8bpc_avx512icl.main_pass2b +cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main +cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast +cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast2 +cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_end +cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf +cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf_fast +cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf_fast2 +cextern inv_txfm_add_dct_dct_32x8_8bpc_avx512icl.main +cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf +cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast +cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast2 +cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf +cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast +cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast2 +cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf +cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf_fast + +SECTION .text + +%define o_base (pw_2048+4*128) +%define o_base_8bpc (int8_permA+64*18) +%define o(x) (r5 - o_base + (x)) +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +INIT_ZMM avx512icl + +; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 +; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 +; flags: 1 = inv_dst1, 2 = inv_dst2 +; skip round/shift if rnd is 
not a number +%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags +%if %8 < 32 + pmulld m%4, m%1, m%8 + pmulld m%3, m%2, m%8 +%else +%if %8 < 4096 + vpbroadcastd m%3, [o(pd_%8)] +%else + vbroadcasti32x4 m%3, [o(pd_%8)] +%endif + pmulld m%4, m%1, m%3 + pmulld m%3, m%2 +%endif +%if %7 < 32 + pmulld m%1, m%7 + pmulld m%2, m%7 +%else +%if %7 < 4096 + vpbroadcastd m%5, [o(pd_%7)] +%else + vbroadcasti32x4 m%5, [o(pd_%7)] +%endif + pmulld m%1, m%5 + pmulld m%2, m%5 +%endif +%if %9 & 2 + psubd m%4, m%6, m%4 + psubd m%2, m%4, m%2 +%else +%ifnum %6 + paddd m%4, m%6 +%endif + paddd m%2, m%4 +%endif +%ifnum %6 + paddd m%1, m%6 +%endif +%if %9 & 1 + psubd m%1, m%3, m%1 +%else + psubd m%1, m%3 +%endif +%ifnum %6 + psrad m%2, 12 + psrad m%1, 12 +%endif +%endmacro + +%macro INV_TXFM_FN 4 ; type1, type2, eob_offset, size +cglobal inv_txfm_add_%1_%2_%4_10bpc, 4, 7, 0, dst, stride, c, eob, tx2 + %define %%p1 m(i%1_%4_internal_10bpc) + lea r5, [o_base] + ; Jump to the 1st txfm function if we're not taking the fast path, which + ; in turn performs an indirect jump to the 2nd txfm function. + lea tx2q, [m(i%2_%4_internal_10bpc).pass2] +%ifidn %1_%2, dct_dct + test eobd, eobd + jnz %%p1 +%else +%if %3 + add eobd, %3 +%endif + ; jump to the 1st txfm function unless it's located directly after this + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endmacro + +%macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset + INV_TXFM_FN %1, %2, %3, 8x8 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 8 +.dconly: + add r6d, 384 + sar r6d, 9 +.dconly2: + vpbroadcastd ym2, [o(dconly_10bpc)] + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 + vpbroadcastw ym1, r6d + paddsw ym1, ym2 +.dconly_loop: + mova xm0, [dstq+strideq*0] + vinserti32x4 ym0, [dstq+strideq*1], 1 + paddsw ym0, ym1 + psubusw ym0, ym2 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + RET +%endif +%endmacro + +INV_TXFM_8X8_FN dct, dct +INV_TXFM_8X8_FN dct, adst +INV_TXFM_8X8_FN dct, flipadst +INV_TXFM_8X8_FN dct, identity + +cglobal idct_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 + call .load + vpermi2q m1, m0, m2 ; 1 5 + vpermi2q m3, m6, m4 ; 7 3 + vpermt2q m0, m5, m4 ; 0 2 + vpermt2q m2, m5, m6 ; 4 6 + call .main + call .main_end + mova m4, [o(idct8x8p)] + packssdw m0, m2 ; 0 1 4 5 + packssdw m1, m3 ; 3 2 7 6 + vpermb m0, m4, m0 + vprolq m1, 32 + vpermb m2, m4, m1 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + jmp tx2q +.pass2: + lea r5, [o_base_8bpc] + vextracti32x8 ym2, m0, 1 + vextracti32x8 ym3, m1, 1 + call m(idct_8x8_internal_8bpc).main + mova m10, [permC] + vpbroadcastd m12, [pw_2048] +.end: + vpermt2q m0, m10, m1 + vpermt2q m2, m10, m3 +.end2: + vpbroadcastd m11, [pixel_10bpc_max] + lea r6, [strideq*3] + pxor m10, m10 + pmulhrsw m8, m12, m0 + call .write_8x4_start + pmulhrsw m8, m12, m2 +.write_8x4: + lea dstq, [dstq+strideq*4] + add cq, 64*2 +.write_8x4_start: + mova xm9, [dstq+strideq*0] + vinserti32x4 ym9, [dstq+strideq*1], 1 + vinserti32x4 m9, [dstq+strideq*2], 2 + vinserti32x4 m9, [dstq+r6 ], 3 + mova [cq+64*0], m10 + mova [cq+64*1], m10 + paddw m9, m8 + pmaxsw m9, m10 + pminsw m9, m11 + mova [dstq+strideq*0], xm9 + vextracti32x4 [dstq+strideq*1], ym9, 1 + vextracti32x4 [dstq+strideq*2], m9, 2 + vextracti32x4 [dstq+r6 ], m9, 3 + ret +ALIGN function_align +.load: + mova m0, [cq+64*0] ; 0 1 + mova m4, [cq+64*1] ; 2 3 + mova m1, [o(permB)] + mova m2, [cq+64*2] ; 4 5 + mova m6, 
[cq+64*3] ; 6 7 + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + psrlq m5, m1, 32 + vpbroadcastd m12, [o(pd_2896)] + mova m3, m1 + vpbroadcastd m11, [o(pd_1)] + ret +ALIGN function_align +.main_fast: ; bottom half is zero + vbroadcasti32x4 m3, [o(pd_4017_3406)] + vbroadcasti32x4 m8, [o(pd_799_m2276)] + vbroadcasti32x4 m2, [o(pd_2896_3784)] + vbroadcasti32x4 m9, [o(pd_2896_1567)] + pmulld m3, m1 ; t4a t5a + pmulld m1, m8 ; t7a t6a + pmulld m2, m0 ; t0 t3 + pmulld m0, m9 ; t1 t2 + jmp .main2 +.main: + ITX_MULSUB_2D 1, 3, 8, 9, 10, _, 799_3406, 4017_2276 + ITX_MULSUB_2D 0, 2, 8, 9, 10, _, 2896_1567, 2896_3784 +.main2: + REPX {paddd x, m13}, m1, m3, m0, m2 + REPX {psrad x, 12 }, m1, m3, m0, m2 + punpcklqdq m8, m1, m3 ; t4a t7a + punpckhqdq m1, m3 ; t5a t6a + psubd m3, m8, m1 ; t5a t6a + paddd m8, m1 ; t4 t7 + pmaxsd m3, m14 + punpckhqdq m1, m2, m0 ; t3 t2 + pminsd m3, m15 + punpcklqdq m2, m0 ; t0 t1 + pmulld m3, m12 + paddd m0, m2, m1 ; dct4 out0 out1 + psubd m2, m1 ; dct4 out3 out2 + REPX {pmaxsd x, m14}, m8, m0, m2 + REPX {pminsd x, m15}, m8, m0, m2 +.main3: + pshufd m1, m3, q1032 + paddd m3, m13 + psubd m9, m3, m1 + paddd m3, m1 + psrad m9, 12 + psrad m3, 12 + punpckhqdq m1, m8, m3 ; t7 t6 + shufpd m8, m9, 0xaa ; t4 t5 + ret +.main_end: + paddd m0, m11 + paddd m2, m11 + psubd m3, m0, m1 ; out7 out6 + paddd m0, m1 ; out0 out1 + paddd m1, m2, m8 ; out3 out2 + psubd m2, m8 ; out4 out5 + REPX {vpsravd x, m11}, m0, m2, m3, m1 + ret + +INV_TXFM_8X8_FN adst, dct +INV_TXFM_8X8_FN adst, flipadst +INV_TXFM_8X8_FN adst, identity +INV_TXFM_8X8_FN adst, adst + +cglobal iadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 + call m(idct_8x8_internal_10bpc).load + vpermi2q m1, m6, m2 ; 7 5 + vpermi2q m3, m4, m0 ; 3 1 + vpermt2q m0, m5, m4 ; 0 2 + vpermt2q m2, m5, m6 ; 4 6 + call .main + punpckldq m1, m2, m4 ; out4 out6 + punpckhdq m2, m0 ; -out5 -out7 + punpckldq m0, m3 ; out0 out2 + punpckhdq m4, m3 ; -out1 -out3 + paddd m1, m11 + psubd m3, m11, m2 + paddd m0, m11 + psubd m4, m11, m4 +.pass1_end: + REPX {psrad x, 1}, m1, m0, m3, m4 + packssdw m0, m1 ; 0 2 4 6 + packssdw m4, m3 ; 1 3 5 7 + psrlq m1, [o(permB)], 8 + punpckhwd m3, m0, m4 + punpcklwd m0, m4 + psrlq m2, m1, 32 + vpermi2q m1, m0, m3 + vpermt2q m0, m2, m3 + jmp tx2q +.pass2: + call .main_pass2 + movu m10, [permC+2] + vbroadcasti32x8 m12, [pw_2048_m2048+16] + jmp m(idct_8x8_internal_10bpc).end +.main_pass2: + vextracti32x8 ym2, m0, 1 + vextracti32x8 ym3, m1, 1 + lea r5, [o_base_8bpc] + pshufd ym4, ym0, q1032 + pshufd ym5, ym1, q1032 + jmp m(iadst_8x8_internal_8bpc).main_pass2 +ALIGN function_align +.main: + ITX_MULSUB_2D 1, 0, 4, 5, 6, 13, 401_1931, 4076_3612 + ITX_MULSUB_2D 3, 2, 4, 5, 6, 13, 3166_3920, 2598_1189 + psubd m4, m0, m2 ; t4 t6 + paddd m0, m2 ; t0 t2 + psubd m2, m1, m3 ; t5 t7 + paddd m1, m3 ; t1 t3 + REPX {pmaxsd x, m14}, m4, m2, m0, m1 + REPX {pminsd x, m15}, m4, m2, m0, m1 + pxor m5, m5 + psubd m5, m4 + shufpd m4, m2, 0xaa ; t4 t7 + shufpd m2, m5, 0xaa ; t5 -t6 + ITX_MULSUB_2D 4, 2, 3, 5, 6, 13, 1567, 3784 + punpckhqdq m3, m0, m1 + punpcklqdq m0, m1 + psubd m1, m0, m3 ; t2 t3 + paddd m0, m3 ; out0 -out7 + punpckhqdq m3, m4, m2 ; t7a t6a + punpcklqdq m4, m2 ; t5a t4a + psubd m2, m4, m3 ; t7 t6 + paddd m4, m3 ; out6 -out1 + REPX {pmaxsd x, m14}, m1, m2 + REPX {pminsd x, m15}, m1, m2 + shufpd m3, m1, m2, 0xaa + shufpd m1, m2, 0x55 + pmulld m3, m12 + pmulld m1, m12 + paddd m3, m13 + psubd m2, m3, m1 + paddd m3, m1 + psrad m2, 12 ; out4 -out5 + pshufd m3, m3, q1032 + 
psrad m3, 12 ; out2 -out3 + ret + +INV_TXFM_8X8_FN flipadst, dct +INV_TXFM_8X8_FN flipadst, adst +INV_TXFM_8X8_FN flipadst, identity +INV_TXFM_8X8_FN flipadst, flipadst + +cglobal iflipadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 + call m(idct_8x8_internal_10bpc).load + vpermi2q m1, m6, m2 ; 7 5 + vpermi2q m3, m4, m0 ; 3 1 + vpermt2q m0, m5, m4 ; 0 2 + vpermt2q m2, m5, m6 ; 4 6 + call m(iadst_8x8_internal_10bpc).main + punpckhdq m1, m3, m4 ; -out3 -out1 + punpckldq m3, m0 ; out2 out0 + punpckhdq m0, m2 ; -out7 -out5 + punpckldq m4, m2 ; out6 out4 + psubd m1, m11, m1 + paddd m3, m11 + psubd m0, m11, m0 + paddd m4, m11 + jmp m(iadst_8x8_internal_10bpc).pass1_end +.pass2: + call m(iadst_8x8_internal_10bpc).main_pass2 + movu m10, [permC+1] + vbroadcasti32x8 m12, [pw_m2048_2048+16] + lea r6, [strideq*3] + vpermt2q m0, m10, m1 ; 7 6 5 4 + vpbroadcastd m11, [pixel_10bpc_max] + vpermt2q m2, m10, m3 ; 3 2 1 0 + pxor m10, m10 + pmulhrsw m8, m12, m2 + call m(idct_8x8_internal_10bpc).write_8x4_start + pmulhrsw m8, m12, m0 + jmp m(idct_8x8_internal_10bpc).write_8x4 + +INV_TXFM_8X8_FN identity, dct +INV_TXFM_8X8_FN identity, adst +INV_TXFM_8X8_FN identity, flipadst +INV_TXFM_8X8_FN identity, identity + +cglobal iidentity_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 + mova m1, [cq+64*0] + packssdw m1, [cq+64*2] ; 0 4 1 5 + mova m2, [cq+64*1] ; 2 6 3 7 + packssdw m2, [cq+64*3] + mova m0, [o(idtx8x8p)] + vpermb m1, m0, m1 + vpermb m2, m0, m2 + punpckldq m0, m1, m2 ; 0 1 4 5 + punpckhdq m1, m2 ; 2 3 6 7 + jmp tx2q +.pass2: + movu m3, [o(permC+2)] + vpbroadcastd m12, [o(pw_4096)] + psrlq m2, m3, 32 + vpermi2q m2, m0, m1 + vpermt2q m0, m3, m1 + jmp m(idct_8x8_internal_10bpc).end2 + +%macro INV_TXFM_8X16_FN 2-3 0 ; type1, type2, eob_offset + INV_TXFM_FN %1, %2, %3, 8x16 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 16 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly +%endif +%endmacro + +INV_TXFM_8X16_FN dct, dct +INV_TXFM_8X16_FN dct, identity, 35 +INV_TXFM_8X16_FN dct, flipadst +INV_TXFM_8X16_FN dct, adst + +cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + cmp eobd, 43 + jl .fast + call .load + call .main + call .main_end +.pass1_end: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + jmp tx2q +.pass2: + mova m8, [o(idct8x16p)] + REPX {vpermb x, m8, x}, m0, m1, m2, m3 + punpckhdq m5, m0, m1 + punpckldq m0, m1 + punpckhdq m4, m2, m3 + punpckldq m2, m3 + punpcklqdq m8, m0, m2 ; 15 1 + punpckhqdq m0, m2 ; 7 9 + punpckhqdq m1, m5, m4 ; 3 13 + punpcklqdq m5, m4 ; 11 5 + lea r5, [o_base_8bpc] + vextracti32x8 ym7, m8, 1 ; 14 2 + vextracti32x8 ym3, m0, 1 ; 6 10 + vextracti32x8 ym6, m1, 1 ; 12 4 + vextracti32x8 ym9, m5, 1 ; 8 0 + call m(idct_8x16_internal_8bpc).main2 + mova m8, [permC] + vpbroadcastd m12, [pw_2048] + vpermt2q m0, m8, m1 + lea r6, [strideq*3] + vpermt2q m2, m8, m3 + vpbroadcastd m11, [pixel_10bpc_max] + vpermt2q m4, m8, m5 + pxor m10, m10 + vpermt2q m6, m8, m7 + pmulhrsw m8, m12, m0 + call m(idct_8x8_internal_10bpc).write_8x4_start + pmulhrsw m8, m12, m2 + call m(idct_8x8_internal_10bpc).write_8x4 + pmulhrsw m8, m12, m4 + call m(idct_8x8_internal_10bpc).write_8x4 + pmulhrsw m8, m12, m6 + jmp m(idct_8x8_internal_10bpc).write_8x4 +.fast: + mova ym0, [cq+64*0] + mova ym4, [cq+64*2] + mova ym1, [cq+64*1] + mova ym5, [cq+64*5] + mova ym2, [cq+64*4] + mova ym6, [cq+64*6] + mova ym3, [cq+64*7] + mova ym7, [cq+64*3] + call .round_input_fast + call 
m(idct_8x8_internal_10bpc).main + call m(idct_8x8_internal_10bpc).main_end + movu m6, [o(permC+3)] + packssdw m3, m1, m3 + packssdw m1, m0, m2 + vprolq m3, 32 + vpermd m1, m6, m1 + vpermd m3, m6, m3 + mova ym0, ym1 ; 0 4 + vextracti32x8 ym1, m1, 1 ; 1 5 + mova ym2, ym3 ; 2 6 + vextracti32x8 ym3, m3, 1 ; 3 7 + jmp tx2q +ALIGN function_align +.round_input_fast: + movshdup m8, [o(permB)] + vpbroadcastd m12, [o(pd_2896)] + vpermt2q m0, m8, m4 + vpermt2q m1, m8, m5 + vpermt2q m2, m8, m6 + vpermt2q m3, m8, m7 + vpbroadcastd m13, [o(pd_2048)] + REPX {pmulld x, m12}, m0, m1, m2, m3 + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + REPX {paddd x, m13}, m0, m1, m2, m3 + vpbroadcastd m11, [o(pd_1)] + REPX {psrad x, 12 }, m0, m1, m2, m3 + ret +ALIGN function_align +.load: + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] +.load2: + vpbroadcastd m12, [o(pd_2896)] + pmulld m0, m12, [cq+64*0] + pmulld m1, m12, [cq+64*1] + pmulld m2, m12, [cq+64*2] + pmulld m3, m12, [cq+64*3] + vpbroadcastd m13, [o(pd_2048)] + pmulld m4, m12, [cq+64*4] + pmulld m5, m12, [cq+64*5] + pmulld m6, m12, [cq+64*6] + pmulld m7, m12, [cq+64*7] +.round: + REPX {paddd x, m13}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 + REPX {paddd x, m13}, m4, m5, m6, m7 + REPX {psrad x, 12 }, m4, m5, m6, m7 + ret +ALIGN function_align +.main_fast_rect2: + REPX {paddd x, m13}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 +.main_fast: + pmulld m0, m12 + pmulld m5, m3, [o(pd_2276)] {1to16} ; t5a + pmulld m3, [o(pd_3406)] {1to16} ; t6a + pmulld m7, m1, [o(pd_4017)] {1to16} ; t7a + pmulld m1, [o(pd_799)] {1to16} ; t4a + pmulld m6, m2, [o(pd_3784)] {1to16} ; t3 + pmulld m2, [o(pd_1567)] {1to16} ; t2 + paddd m0, m13 + psubd m5, m13, m5 + psrad m0, 12 ; t0 + mova m9, m0 ; t1 + jmp .main2 +.main_rect2: + call .round +.main: + pmulld m0, m12 + ITX_MULSUB_2D 5, 3, 8, 9, 10, _, 3406, 2276 ; t5a t6a + ITX_MULSUB_2D 1, 7, 8, 9, 10, _, 799, 4017 ; t4a t7a + ITX_MULSUB_2D 2, 6, 8, 9, 10, _, 1567, 3784 ; t2 t3 + pmulld m4, m12 + paddd m0, m13 + paddd m5, m13 + psubd m9, m0, m4 ; t1 + paddd m0, m4 ; t0 + psrad m9, 12 + psrad m0, 12 +.main2: + REPX {paddd x, m13}, m3, m1, m7 + REPX {psrad x, 12 }, m5, m1, m3, m7 + paddd m8, m1, m5 ; t4 + psubd m1, m5 ; t5a + psubd m5, m7, m3 ; t6a + paddd m7, m3 ; t7 + pmaxsd m5, m14 + pmaxsd m1, m14 + paddd m2, m13 + paddd m6, m13 + pminsd m5, m15 + pminsd m1, m15 + pmulld m5, m12 + pmulld m1, m12 + pmaxsd m8, m14 + pmaxsd m7, m14 + pminsd m8, m15 + paddd m5, m13 + psubd m4, m5, m1 + paddd m5, m1 + REPX {psrad x, 12 }, m2, m6, m5, m4 + paddd m1, m9, m2 ; dct4 out1 + psubd m2, m9, m2 ; dct4 out2 + psubd m3, m0, m6 ; dct4 out3 + paddd m0, m6 ; dct4 out0 + pminsd m6, m15, m7 + REPX {pmaxsd x, m14}, m0, m1, m2, m3 + REPX {pminsd x, m15}, m0, m1, m2, m3 + ret +.main_end: + vpbroadcastd m11, [o(pd_1)] +.main_end2: + REPX {paddd x, m11}, m0, m1, m2, m3 + psubd m7, m0, m6 ; out7 + paddd m0, m6 ; out0 + psubd m6, m1, m5 ; out6 + paddd m1, m5 ; out1 + psubd m5, m2, m4 ; out5 + paddd m2, m4 ; out2 + psubd m4, m3, m8 ; out4 + paddd m3, m8 ; out3 + REPX {vpsravd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + ret + +INV_TXFM_8X16_FN adst, dct +INV_TXFM_8X16_FN adst, identity, 35 +INV_TXFM_8X16_FN adst, flipadst +INV_TXFM_8X16_FN adst, adst + +cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + cmp eobd, 43 + jl .fast + call m(idct_8x16_internal_10bpc).load + call .main + psrad m0, 1 + psrad m1, 1 + psrad m6, m10, 1 + psrad m7, m11, 1 + psrad m2, 
12 + psrad m3, 12 + psrad m4, m8, 12 + psrad m5, m9, 12 + jmp m(idct_8x16_internal_10bpc).pass1_end +.fast: + call .fast_main + punpcklqdq m1, m2, m4 ; out4 out6 + punpckhqdq m2, m0 ; -out5 -out7 + punpcklqdq m0, m3 ; out0 out2 + punpckhqdq m4, m3 ; -out1 -out3 + paddd m1, m11 + psubd m3, m11, m2 + paddd m0, m11 + psubd m4, m11, m4 +.fast_end: + movu m5, [o(permC+3)] + REPX {psrad x, 1}, m1, m0, m3, m4 + packssdw m2, m0, m1 ; 0 2 4 6 + packssdw m3, m4, m3 ; 1 3 5 7 + vpermd m2, m5, m2 + vpermd m3, m5, m3 + mova ym0, ym2 + vextracti32x8 ym2, m2, 1 + mova ym1, ym3 + vextracti32x8 ym3, m3, 1 + jmp tx2q +.pass2: + call .pass2_main + movu m4, [permB+2] + vbroadcasti32x8 m12, [pw_2048_m2048+16] + psrlq m7, m4, 8 + vpermi2q m4, m0, m3 ; 0 1 2 3 + psrlq m5, m7, 24 + vpermi2q m7, m0, m3 ; 12 13 14 15 + psrlq m6, m5, 8 + vpermq m5, m5, m1 ; 4 5 6 7 + vpermq m6, m6, m2 ; 8 9 10 11 +.pass2_end: + vpbroadcastd m11, [pixel_10bpc_max] + pxor m10, m10 + lea r6, [strideq*3] + pmulhrsw m8, m12, m4 + call m(idct_8x8_internal_10bpc).write_8x4_start + pmulhrsw m8, m12, m5 + call m(idct_8x8_internal_10bpc).write_8x4 + pmulhrsw m8, m12, m6 + call m(idct_8x8_internal_10bpc).write_8x4 + pmulhrsw m8, m12, m7 + jmp m(idct_8x8_internal_10bpc).write_8x4 +ALIGN function_align +.main: + ITX_MULSUB_2D 7, 0, 8, 9, 10, 13, 401, 4076 ; t1a, t0a + ITX_MULSUB_2D 1, 6, 8, 9, 10, 13, 3920, 1189 ; t7a, t6a + ITX_MULSUB_2D 5, 2, 8, 9, 10, 13, 1931, 3612 ; t3a, t2a + ITX_MULSUB_2D 3, 4, 8, 9, 10, 13, 3166, 2598 ; t5a, t4a + psubd m8, m2, m6 ; t6 + paddd m2, m6 ; t2 + psubd m6, m0, m4 ; t4 + paddd m0, m4 ; t0 + psubd m4, m5, m1 ; t7 + paddd m5, m1 ; t3 + psubd m1, m7, m3 ; t5 + paddd m7, m3 ; t1 + REPX {pmaxsd x, m14}, m6, m1, m8, m4, m2, m0, m5, m7 + REPX {pminsd x, m15}, m6, m1, m8, m4, m2, m0, m5, m7 + vpbroadcastd m10, [o(pd_1567)] + vpbroadcastd m11, [o(pd_3784)] + ITX_MULSUB_2D 6, 1, 3, 9, _, 13, 10, 11 ; t5a, t4a + ITX_MULSUB_2D 4, 8, 3, 9, _, 13, 11, 10 ; t6a, t7a + vpbroadcastd m12, [o(pd_1448)] + psubd m9, m6, m8 ; t7 + paddd m6, m8 ; out6 + psubd m3, m7, m5 ; t3 + paddd m7, m5 ; -out7 + psubd m5, m0, m2 ; t2 + paddd m0, m2 ; out0 + psubd m2, m1, m4 ; t6 + paddd m1, m4 ; -out1 + REPX {pmaxsd x, m14}, m5, m3, m2, m9 + REPX {pminsd x, m15}, m5, m3, m2, m9 + REPX {pmulld x, m12}, m5, m3, m2, m9 + vpbroadcastd m4, [o(pd_1)] + psubd m8, m5, m3 ; (t2 - t3) * 1448 + paddd m3, m5 ; (t2 + t3) * 1448 + psubd m5, m2, m9 ; (t6 - t7) * 1448 + paddd m2, m9 ; (t6 + t7) * 1448 + vpbroadcastd m9, [o(pd_3072)] + paddd m0, m4 + psubd m1, m4, m1 + paddd m10, m6, m4 + psubd m11, m4, m7 + paddd m2, m9 + paddd m8, m9 + vpbroadcastd m9, [o(pd_3071)] + psubd m3, m9, m3 + psubd m9, m5 + ret +ALIGN function_align +.fast_main: + mova ym0, [cq+64*0] + mova ym4, [cq+64*2] + mova ym1, [cq+64*7] + mova ym5, [cq+64*5] + mova ym2, [cq+64*4] + mova ym6, [cq+64*6] + mova ym3, [cq+64*3] + mova ym7, [cq+64*1] + call m(idct_8x16_internal_10bpc).round_input_fast + jmp m(iadst_8x8_internal_10bpc).main +ALIGN function_align +.pass2_main: + mova m8, [o(iadst8x16p)] + REPX {vpermb x, m8, x}, m0, m1, m2, m3 + vpbroadcastd m10, [o(pw_2896x8)] + punpckhdq m5, m0, m1 + punpckldq m0, m1 + punpckhdq m1, m2, m3 + punpckldq m2, m3 + lea r5, [o_base_8bpc] + punpckhqdq m4, m0, m2 ; 12 3 14 1 + punpcklqdq m0, m2 ; 0 15 2 13 + punpckhqdq m6, m5, m1 ; 8 7 10 5 + punpcklqdq m5, m1 ; 4 11 6 9 + call m(iadst_8x16_internal_8bpc).main2 + paddsw m1, m2, m4 + psubsw m2, m4 + pmulhrsw m1, m10 ; -out7 out4 out6 -out5 + pmulhrsw m2, m10 ; out8 -out11 -out9 out10 + ret + +INV_TXFM_8X16_FN 
flipadst, dct +INV_TXFM_8X16_FN flipadst, identity, 35 +INV_TXFM_8X16_FN flipadst, adst +INV_TXFM_8X16_FN flipadst, flipadst + +cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + cmp eobd, 43 + jl .fast + call m(idct_8x16_internal_10bpc).load + call m(iadst_8x16_internal_10bpc).main + psrad m7, m0, 1 + psrad m0, m11, 1 + psrad m6, m1, 1 + psrad m1, m10, 1 + psrad m5, m2, 12 + psrad m2, m9, 12 + psrad m4, m3, 12 + psrad m3, m8, 12 + jmp m(idct_8x16_internal_10bpc).pass1_end +.fast: + call m(iadst_8x16_internal_10bpc).fast_main + punpckhqdq m1, m3, m4 ; -out3 -out1 + punpcklqdq m3, m0 ; out2 out0 + punpckhqdq m0, m2 ; -out7 -out5 + punpcklqdq m4, m2 ; out6 out4 + psubd m1, m11, m1 + paddd m3, m11 + psubd m0, m11, m0 + paddd m4, m11 + jmp m(iadst_8x16_internal_10bpc).fast_end +.pass2: + call m(iadst_8x16_internal_10bpc).pass2_main + movu m7, [permB+2] + vbroadcasti32x8 m12, [pw_m2048_2048+16] + psrlq m4, m7, 8 + vpermi2q m7, m3, m0 ; 3 2 1 0 + psrlq m5, m4, 24 + vpermi2q m4, m3, m0 ; 15 14 13 12 + psrlq m6, m5, 8 + vpermq m5, m5, m2 ; 11 10 9 8 + vpermq m6, m6, m1 ; 7 6 5 4 + jmp m(iadst_8x16_internal_10bpc).pass2_end + +INV_TXFM_8X16_FN identity, dct +INV_TXFM_8X16_FN identity, adst +INV_TXFM_8X16_FN identity, flipadst +INV_TXFM_8X16_FN identity, identity + +cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 + call m(idct_8x16_internal_10bpc).load2 + jmp m(idct_8x16_internal_10bpc).pass1_end +.pass2: + vpbroadcastd m8, [o(pw_1697x16)] + pmulhrsw m4, m8, m0 + pmulhrsw m5, m8, m1 + pmulhrsw m6, m8, m2 + pmulhrsw m7, m8, m3 + REPX {paddsw x, x}, m0, m1, m2, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + vpbroadcastd m7, [o(pw_2048)] + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + vpbroadcastd m6, [o(pixel_10bpc_max)] + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m1 + punpckhdq m4, m1 + pxor m5, m5 + punpckhqdq m1, m0, m2 ; 1 5 9 13 + punpcklqdq m0, m2 ; 0 4 8 12 + punpcklqdq m2, m3, m4 ; 2 6 10 14 + punpckhqdq m3, m4 ; 3 7 11 15 + lea r6, [strideq*3] + pmulhrsw m0, m7 + call .write_8x4_start + pmulhrsw m0, m7, m1 + call .write_8x4 + pmulhrsw m0, m7, m2 + call .write_8x4 + pmulhrsw m0, m7, m3 +.write_8x4: + add dstq, strideq + add cq, 64*2 +.write_8x4_start: + mova xm4, [dstq+strideq*0] + vinserti32x4 ym4, [dstq+strideq*4], 1 + vinserti32x4 m4, [dstq+strideq*8], 2 + vinserti32x4 m4, [dstq+r6*4 ], 3 + mova [cq+64*0], m5 + mova [cq+64*1], m5 + paddw m4, m0 + pmaxsw m4, m5 + pminsw m4, m6 + mova [dstq+strideq*0], xm4 + vextracti32x4 [dstq+strideq*4], ym4, 1 + vextracti32x4 [dstq+strideq*8], m4, 2 + vextracti32x4 [dstq+r6*4 ], m4, 3 + ret + +%macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset + INV_TXFM_FN %1, %2, %3, 16x8 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 8 +.dconly: + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 384 + sar r6d, 9 +.dconly2: + vpbroadcastd m2, [o(dconly_10bpc)] + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 + vpbroadcastw m1, r6d + paddsw m1, m2 +.dconly_loop: + mova ym0, [dstq+strideq*0] + vinserti32x8 m0, [dstq+strideq*1], 1 + paddsw m0, m1 + psubusw m0, m2 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + RET +%endif +%endmacro + +INV_TXFM_16X8_FN dct, dct +INV_TXFM_16X8_FN dct, identity, -21 +INV_TXFM_16X8_FN dct, flipadst +INV_TXFM_16X8_FN dct, adst + +cglobal idct_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, 
eob, tx2 +%undef cmp + vpbroadcastd m12, [o(pd_2896)] + pmulld m4, m12, [cq+64*0] ; 0 1 + pmulld m9, m12, [cq+64*1] ; 2 3 + pmulld m8, m12, [cq+64*2] ; 4 5 + pmulld m7, m12, [cq+64*3] ; 6 7 + vpbroadcastd m13, [o(pd_2048)] + pxor m2, m2 + mova m15, [o(permB)] + REPX {mova [cq+64*x], m2}, 0, 1, 2, 3 + psrlq m0, m15, 32 + REPX {paddd x, m13}, m4, m9, m8, m7 + vpbroadcastd m14, [o(clip_18b_min)] + REPX {psrad x, 12 }, m4, m8, m9, m7 + mova m1, m0 + vpermi2q m0, m4, m8 ; 0 4 + cmp eobd, 43 + jl .fast + pmulld m5, m12, [cq+64*4] ; 8 9 + pmulld m10, m12, [cq+64*5] ; 10 11 + pmulld m11, m12, [cq+64*6] ; 12 13 + pmulld m6, m12, [cq+64*7] ; 14 15 + REPX {mova [cq+64*x], m2}, 4, 5, 6, 7 + REPX {paddd x, m13}, m5, m10, m11, m6 + REPX {psrad x, 12 }, m10, m5, m11, m6 + mova m2, m1 + vpermi2q m1, m9, m10 ; 2 10 + mova m3, m2 + vpermi2q m2, m5, m11 ; 8 12 + vpermi2q m3, m6, m7 ; 14 6 + vpermt2q m4, m15, m11 ; 1 13 + vpermt2q m6, m15, m9 ; 15 3 + vpermt2q m5, m15, m8 ; 9 5 + vpermt2q m7, m15, m10 ; 7 11 + vpbroadcastd m15, [o(clip_18b_max)] + call m(idct_8x8_internal_10bpc).main + call .main + jmp .pass1_end +.fast: + vpermi2q m1, m9, m7 ; 2 6 + vpermt2q m4, m15, m9 ; 1 3 + vpermt2q m7, m15, m8 ; 7 5 + vpbroadcastd m15, [o(clip_18b_max)] + call m(idct_8x8_internal_10bpc).main_fast + call .main_fast +.pass1_end: + call m(idct_8x16_internal_10bpc).main_end + mova m8, [o(permA)] + psrlq m9, m8, 8 +.pass1_end2: + mova m10, m9 + mova m11, m8 + call .transpose_16x8 + jmp tx2q +.pass2: + lea r5, [o_base_8bpc] + call m(idct_16x8_internal_8bpc).main + movshdup m4, [permC] + vpbroadcastd m11, [pw_2048] + psrlq m5, m4, 8 +.end: + vpbroadcastd m13, [pixel_10bpc_max] + pxor m12, m12 + vpermq m8, m4, m0 + vpermq m9, m5, m1 + lea r6, [strideq*3] + call .write_16x4 + vpermq m8, m4, m2 + vpermq m9, m5, m3 +.write_16x4: + pmulhrsw m8, m11 + pmulhrsw m9, m11 +.write_16x4_noround: + mova ym10, [dstq+strideq*0] + vinserti32x8 m10, [dstq+strideq*1], 1 + paddw m8, m10 + mova ym10, [dstq+strideq*2] + vinserti32x8 m10, [dstq+r6 ], 1 + paddw m9, m10 + pmaxsw m8, m12 + pmaxsw m9, m12 + pminsw m8, m13 + pminsw m9, m13 + mova [dstq+strideq*0], ym8 + vextracti32x8 [dstq+strideq*1], m8, 1 + mova [dstq+strideq*2], ym9 + vextracti32x8 [dstq+r6 ], m9, 1 + lea dstq, [dstq+strideq*4] + ret +ALIGN function_align +.main_fast: ; bottom half is zero + vbroadcasti32x4 m6, [o(pd_4076_3920)] + vbroadcasti32x4 m3, [o(pd_401_m1189)] + vbroadcasti32x4 m5, [o(pd_m2598_1931)] + vbroadcasti32x4 m9, [o(pd_3166_3612)] + pmulld m6, m4 ; t15a t12a + pmulld m4, m3 ; t8a t11a + pmulld m5, m7 ; t9a t10a + pmulld m7, m9 ; t14a t13a + jmp .main2 +.main: + ITX_MULSUB_2D 4, 6, 3, 9, 10, _, 401_3920, 4076_1189 + ITX_MULSUB_2D 5, 7, 3, 9, 10, _, 3166_1931, 2598_3612 +.main2: + REPX {paddd x, m13}, m4, m6, m5, m7 + REPX {psrad x, 12 }, m4, m5, m6, m7 + paddd m9, m4, m5 ; t8 t11 + psubd m4, m5 ; t9 t10 + psubd m5, m6, m7 ; t14 t13 + paddd m6, m7 ; t15 t12 + REPX {pmaxsd x, m14}, m5, m4, m9, m6 + REPX {pminsd x, m15}, m5, m4, m9, m6 +.main3: + psubd m3, m0, m1 ; dct8 out7 out6 + paddd m0, m1 ; dct8 out0 out1 + vbroadcasti32x4 m7, [o(pd_3784_m3784)] + pmulld m7, m5 + vpmulld m5, [o(pd_1567)] {1to16} + paddd m1, m2, m8 ; dct8 out3 out2 + psubd m2, m8 ; dct8 out4 out5 + vbroadcasti32x4 m8, [o(pd_1567_m1567)] + pmulld m8, m4 + vpmulld m4, [o(pd_3784)] {1to16} + REPX {pmaxsd x, m14}, m0, m1 + REPX {pminsd x, m15}, m0, m1 + paddd m7, m13 + paddd m5, m13 + paddd m7, m8 + psubd m5, m4 + psrad m7, 12 ; t14a t10a + psrad m5, 12 ; t9a t13a + punpckhqdq m4, m9, m7 + punpcklqdq 
m8, m9, m5 + punpckhqdq m5, m6, m5 + punpcklqdq m6, m7 + psubd m7, m8, m4 ; t11a t10 + paddd m8, m4 ; t8a t9 + psubd m4, m6, m5 ; t12a t13 + paddd m6, m5 ; t15a t14 + REPX {pmaxsd x, m14}, m4, m7 + REPX {pminsd x, m15}, m4, m7 + pmulld m4, m12 + pmulld m7, m12 + REPX {pmaxsd x, m14}, m2, m3, m6, m8 + REPX {pminsd x, m15}, m2, m3, m6, m8 + paddd m4, m13 + paddd m5, m4, m7 + psubd m4, m7 + psrad m4, 12 ; t11 t10a + psrad m5, 12 ; t12 t13a + ret +ALIGN function_align +.transpose_16x8: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + vpermi2d m8, m0, m2 + vpermt2d m0, m9, m2 + vpermi2d m10, m1, m3 + vpermi2d m11, m1, m3 + punpckhwd m3, m8, m0 + punpcklwd m1, m8, m0 + punpckhwd m4, m10, m11 + punpcklwd m2, m10, m11 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + ret + +INV_TXFM_16X8_FN adst, dct +INV_TXFM_16X8_FN adst, identity, -21 +INV_TXFM_16X8_FN adst, flipadst +INV_TXFM_16X8_FN adst, adst + +cglobal iadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + call .main_pass1 + vpbroadcastd m9, [o(pd_1)] + paddd m0, m9 + psubd m1, m9, m1 + paddd m2, m9 + psubd m3, m9, m3 + paddd m4, m9, m5 + psubd m5, m9, m6 + paddd m6, m9, m7 + psubd m7, m9, m8 +.pass1_end: + mova m9, [o(permA)] + psrlq m8, m9, 8 + REPX {psrad x, 1}, m0, m4, m1, m5, m2, m6, m3, m7 + jmp m(idct_16x8_internal_10bpc).pass1_end2 +.pass2: + call .main_pass2 + vpermq m8, m11, m0 + vpermq m9, m11, m1 + call m(idct_16x8_internal_10bpc).write_16x4_noround + vpermq m8, m11, m2 + vpermq m9, m11, m3 + jmp m(idct_16x8_internal_10bpc).write_16x4_noround +ALIGN function_align +.main_pass1: + vpbroadcastd m12, [o(pd_2896)] + pmulld m2, m12, [cq+64*0] + pmulld m7, m12, [cq+64*1] + pmulld m1, m12, [cq+64*2] + pmulld m5, m12, [cq+64*3] + vpbroadcastd m13, [o(pd_2048)] + pxor m4, m4 + mova m10, [o(permB)] + REPX {mova [cq+64*x], m4}, 0, 1, 2, 3 + REPX {paddd x, m13}, m2, m7, m1, m5 + psrlq m6, m10, 32 + REPX {psrad x, 12 }, m2, m7, m1, m5 + mova m0, m6 + vpermi2q m0, m2, m7 ; 0 2 + vpermt2q m7, m10, m2 ; 3 1 + mova m2, m6 + vpermi2q m2, m1, m5 ; 4 6 + vpermt2q m5, m10, m1 ; 7 5 + cmp eobd, 43 + jl .main_fast + pmulld m8, m12, [cq+64*4] + pmulld m3, m12, [cq+64*5] + pmulld m9, m12, [cq+64*6] + pmulld m1, m12, [cq+64*7] + REPX {mova [cq+64*x], m4}, 4, 5, 6, 7 + REPX {paddd x, m13}, m8, m3, m9, m1 + REPX {psrad x, 12 }, m8, m3, m9, m1 + mova m4, m6 + vpermi2q m4, m8, m3 ; 8 10 + vpermt2q m3, m10, m8 ; 11 9 + vpermi2q m6, m9, m1 ; 12 14 + vpermt2q m1, m10, m9 ; 15 13 +.main: + ITX_MULSUB_2D 1, 0, 8, 9, 10, _, 201_995, 4091_3973, 1 + ITX_MULSUB_2D 3, 2, 8, 9, 10, _, 1751_2440, 3703_3290, 1 + ITX_MULSUB_2D 5, 4, 8, 9, 10, _, 3035_3513, 2751_2106 + ITX_MULSUB_2D 7, 6, 8, 9, 10, _, 3857_4052, 1380_601 + jmp .main2 +.main_fast: + vbroadcasti32x4 m1, [o(pd_4091_3973)] + vbroadcasti32x4 m8, [o(pd_201_995)] + vbroadcasti32x4 m3, [o(pd_3703_3290)] + vbroadcasti32x4 m9, [o(pd_1751_2440)] + vbroadcasti32x4 m4, [o(pd_2751_2106)] + vbroadcasti32x4 m10, [o(pd_3035_3513)] + vbroadcasti32x4 m6, [o(pd_1380_601)] + vbroadcasti32x4 m11, [o(pd_3857_4052)] + pmulld m1, m0 + pmulld m0, m8 + pmulld m3, m2 + pmulld m2, m9 + pmulld m4, m5 + pmulld m5, m10 + pmulld m6, m7 + pmulld m7, m11 +.main2: + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + REPX {psubd x, m13, x}, m1, m3 + REPX {paddd x, m13 }, m0, m2, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m4, m1, m5, m2, m6, m3, m7 + psubd m8, m0, m4 ; t8a t10a + paddd m0, m4 ; t0a t2a + psubd m4, m1, m5 ; t9a t11a + paddd 
m1, m5 ; t1a t3a + psubd m5, m2, m6 ; t12a t14a + paddd m2, m6 ; t4a t6a + psubd m6, m3, m7 ; t13a t15a + paddd m3, m7 ; t5a t7a + REPX {pmaxsd x, m14}, m8, m4, m5, m6 + REPX {pminsd x, m15}, m8, m4, m5, m6 + vbroadcasti32x4 m11, [o(pd_4017_2276)] + vbroadcasti32x4 m10, [o(pd_799_3406)] + ITX_MULSUB_2D 8, 4, 7, 9, _, 13, 10, 11 + ITX_MULSUB_2D 6, 5, 7, 9, _, 13, 11, 10 + REPX {pmaxsd x, m14}, m0, m2, m1, m3 + REPX {pminsd x, m15}, m0, m2, m1, m3 + psubd m7, m0, m2 ; t4 t6 + paddd m0, m2 ; t0 t2 + psubd m2, m1, m3 ; t5 t7 + paddd m1, m3 ; t1 t3 + psubd m3, m4, m6 ; t12a t14a + paddd m4, m6 ; t8a t10a + psubd m6, m8, m5 ; t13a t15a + paddd m8, m5 ; t9a t11a + REPX {pmaxsd x, m14}, m7, m3, m2, m6 + REPX {pminsd x, m15}, m7, m3, m2, m6 + punpcklqdq m5, m3, m7 ; t12a t4 + punpckhqdq m3, m7 ; t14a t6 + punpckhqdq m7, m6, m2 ; t15a t7 + punpcklqdq m6, m2 ; t13a t5 + vpbroadcastd m11, [o(pd_1567)] + vpbroadcastd m10, [o(pd_3784)] + ITX_MULSUB_2D 7, 3, 2, 9, 10, 13, 10, 11 + ITX_MULSUB_2D 5, 6, 2, 9, 10, 13, 11, 10 + REPX {pmaxsd x, m14}, m0, m4, m1, m8 + REPX {pminsd x, m15}, m0, m4, m1, m8 + punpckhqdq m2, m4, m0 ; t10a t2 + punpcklqdq m4, m0 ; t8a t0 + punpckhqdq m0, m8, m1 ; t11a t3 + punpcklqdq m8, m1 ; t9a t1 + paddd m1, m6, m7 ; out2 -out3 + psubd m6, m7 ; t14a t6 + paddd m7, m5, m3 ; -out13 out12 + psubd m5, m3 ; t15a t7 + psubd m3, m8, m0 ; t11 t3a + paddd m8, m0 ; out14 -out15 + paddd m0, m4, m2 ; -out1 out0 + psubd m4, m2 ; t10 t2a + REPX {pmaxsd x, m14}, m6, m5, m3, m4 + mov r6d, 0x3333 + REPX {pminsd x, m15}, m6, m5, m3, m4 + kmovw k1, r6d + REPX {pmulld x, m12}, m6, m5, m3, m4 + pxor m9, m9 + REPX {vpsubd x{k1}, m9, x}, m0, m1, m7, m8 + paddd m6, m13 + paddd m4, m13 + paddd m2, m6, m5 ; -out5 out4 + psubd m6, m5 ; out10 -out11 + psubd m5, m4, m3 ; -out9 out8 + paddd m3, m4 ; out6 -out7 + REPX {psrad x, 12}, m2, m3, m5, m6 + REPX {vpsubd x{k1}, m9, x}, m2, m3, m5, m6 + ret +ALIGN function_align +.main_pass2: + lea r5, [o_base_8bpc] + pshufd m4, m0, q1032 + pshufd m5, m1, q1032 + call m(iadst_16x8_internal_8bpc).main_pass2 + movshdup m11, [permC] + pmulhrsw m0, m6 + pmulhrsw m1, m6 + vpbroadcastd m13, [pixel_10bpc_max] + pxor m12, m12 + lea r6, [strideq*3] + ret + +INV_TXFM_16X8_FN flipadst, dct +INV_TXFM_16X8_FN flipadst, identity, -21 +INV_TXFM_16X8_FN flipadst, adst +INV_TXFM_16X8_FN flipadst, flipadst + +cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 + call m(iadst_16x8_internal_10bpc).main_pass1 + vpbroadcastd m9, [o(pd_1)] + psubd m4, m9, m3 + paddd m3, m9, m5 + paddd m5, m9, m2 + psubd m2, m9, m6 + psubd m6, m9, m1 + paddd m1, m9, m7 + paddd m7, m9, m0 + psubd m0, m9, m8 + jmp m(iadst_16x8_internal_10bpc).pass1_end +.pass2: + call m(iadst_16x8_internal_10bpc).main_pass2 + psrlq m11, 8 + vpermq m8, m11, m3 + vpermq m9, m11, m2 + call m(idct_16x8_internal_10bpc).write_16x4_noround + vpermq m8, m11, m1 + vpermq m9, m11, m0 + jmp m(idct_16x8_internal_10bpc).write_16x4_noround + +INV_TXFM_16X8_FN identity, dct +INV_TXFM_16X8_FN identity, adst +INV_TXFM_16X8_FN identity, flipadst +INV_TXFM_16X8_FN identity, identity + +cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 + call m(idct_8x16_internal_10bpc).load2 + vpbroadcastd m8, [o(pd_5793)] + vpbroadcastd m13, [o(pd_3072)] + pxor m10, m10 + REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {mova [cq+64*x], m10}, 0, 1, 2, 3, 4, 5, 6, 7 + call m(idct_8x16_internal_10bpc).round + psrlq m8, [o(permA)], 16 + psrlq m9, m8, 8 + mova m10, m8 + mova m11, m9 + call 
m(idct_16x8_internal_10bpc).transpose_16x8 + jmp tx2q +.pass2: + movshdup m4, [o(permC)] + vpbroadcastd m11, [o(pw_4096)] + mova m5, m4 + jmp m(idct_16x8_internal_10bpc).end + +%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset + INV_TXFM_FN %1, %2, %3, 16x16 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 16 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly2 +%endif +%endmacro + +INV_TXFM_16X16_FN dct, dct +INV_TXFM_16X16_FN dct, identity, 28 +INV_TXFM_16X16_FN dct, flipadst +INV_TXFM_16X16_FN dct, adst + +cglobal idct_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + cmp eobd, 36 + jl .fast + mova m0, [cq+64* 0] + mova m1, [cq+64* 2] + mova m2, [cq+64* 4] + mova m3, [cq+64* 6] + mova m4, [cq+64* 8] + mova m5, [cq+64*10] + mova m6, [cq+64*12] + mova m7, [cq+64*14] +%if WIN64 + movaps [cq+16*0], xmm6 + movaps [cq+16*1], xmm7 +%endif + call m(idct_8x16_internal_10bpc).main + mova m16, [cq+64* 1] + mova m17, [cq+64* 3] + mova m18, [cq+64* 5] + mova m19, [cq+64* 7] + mova m20, [cq+64* 9] + mova m21, [cq+64*11] + mova m22, [cq+64*13] + mova m23, [cq+64*15] + call .main + call .main_end +.pass1_end: +%if WIN64 + movaps xmm6, [cq+16*0] + movaps xmm7, [cq+16*1] +%endif + vzeroupper +.pass1_end2: + call .main_end3 +.pass1_end3: + mov r6d, 64*12 + pxor m8, m8 +.zero_loop: + mova [cq+r6+64*3], m8 + mova [cq+r6+64*2], m8 + mova [cq+r6+64*1], m8 + mova [cq+r6+64*0], m8 + sub r6d, 64*4 + jge .zero_loop + jmp tx2q +.pass2: + lea r5, [o_base_8bpc] + call m(idct_16x16_internal_8bpc).main + movshdup m12, [permC] + vpbroadcastd m11, [pw_2048] + psrlq m13, m12, 8 + vpermq m8, m12, m0 + vpermq m0, m13, m7 + vpermq m7, m13, m1 + vpermq m1, m12, m6 + vpermq m6, m12, m2 + vpermq m2, m13, m5 + vpermq m5, m13, m3 + vpermq m3, m12, m4 +.pass2_end: + lea r6, [strideq*3] + vpbroadcastd m13, [pixel_10bpc_max] + pxor m12, m12 + pmulhrsw m8, m11, m8 + pmulhrsw m9, m11, m7 + call m(idct_16x8_internal_10bpc).write_16x4_noround + pmulhrsw m8, m11, m6 + pmulhrsw m9, m11, m5 + call m(idct_16x8_internal_10bpc).write_16x4_noround + pmulhrsw m8, m11, m3 + pmulhrsw m9, m11, m2 + call m(idct_16x8_internal_10bpc).write_16x4_noround + pmulhrsw m8, m11, m1 + pmulhrsw m9, m11, m0 + jmp m(idct_16x8_internal_10bpc).write_16x4_noround +.fast: + mova ym0, [cq+64*0] + mova ym2, [cq+64*4] + movshdup m8, [o(permB)] + mova ym1, [cq+64*2] + mova ym3, [cq+64*6] + mova ym4, [cq+64*1] + mova ym5, [cq+64*3] + mova ym6, [cq+64*5] + mova ym7, [cq+64*7] + vpermt2q m0, m8, m2 ; 0 4 + vpermt2q m1, m8, m3 ; 2 6 + vpermt2q m4, m8, m5 ; 1 3 + vpermt2q m7, m8, m6 ; 7 5 + call m(idct_8x8_internal_10bpc).main_fast + call m(idct_16x8_internal_10bpc).main_fast + vpbroadcastd m11, [o(pd_2)] + call m(idct_8x16_internal_10bpc).main_end2 + mova m8, [o(permA)] + psrlq m9, m8, 8 + jmp m(iadst_16x16_internal_10bpc).pass1_fast_end2 +ALIGN function_align +.main_fast_rect2: + REPX {paddd x, m13}, m16, m17, m18, m19 + REPX {psrad x, 12 }, m16, m17, m18, m19 +.main_fast: + pmulld m23, m16, [o(pd_4076)] {1to16} ; t15a + pmulld m16, [o(pd_401)] {1to16} ; t8a + pmulld m20, m19, [o(pd_2598)] {1to16} ; t9a + pmulld m19, [o(pd_3166)] {1to16} ; t14a + pmulld m22, m17, [o(pd_1189)] {1to16} ; t11a + pmulld m17, [o(pd_3920)] {1to16} ; t12a + pmulld m21, m18, [o(pd_3612)] {1to16} ; t13a + pmulld m18, [o(pd_1931)] {1to16} ; t10a + psubd m20, m13, m20 + psubd 
m22, m13, m22 + call .round2 + jmp .main2 +.main_rect2: + call .round +.main: + ITX_MULSUB_2D 16, 23, 7, 9, 10, _, 401, 4076 ; t8a, t15a + ITX_MULSUB_2D 20, 19, 7, 9, 10, _, 3166, 2598 ; t9a, t14a + ITX_MULSUB_2D 22, 17, 7, 9, 10, _, 3920, 1189 ; t11a, t12a + ITX_MULSUB_2D 18, 21, 7, 9, 10, _, 1931, 3612 ; t10a, t13a + call .round +.main2: + paddd m9, m20, m16 ; t8 + psubd m20, m16, m20 ; t9 + psubd m16, m22, m18 ; t10 + paddd m18, m22 ; t11 + paddd m22, m23, m19 ; t15 + psubd m23, m19 ; t14 + psubd m19, m17, m21 ; t13 + paddd m17, m21 ; t12 + vpbroadcastd m11, [o(pd_3784)] + REPX {pmaxsd x, m14}, m20, m23, m16, m19 + vpbroadcastd m10, [o(pd_1567)] + REPX {pminsd x, m15}, m20, m23, m16, m19 + ITX_MULSUB_2D 23, 20, 21, 7, _, 13, 10, 11 + ITX_MULSUB_2D 19, 16, 21, 7, _, 13, 10, 11, 2 + REPX {pmaxsd x, m14}, m9, m18, m22, m17 + REPX {pminsd x, m15}, m9, m18, m22, m17 + paddd m21, m20, m19 ; t14 + psubd m20, m19 ; t13 + psubd m19, m9, m18 ; t11a + paddd m9, m18 ; t8a + psubd m18, m23, m16 ; t10 + paddd m16, m23 ; t9 + psubd m23, m22, m17 ; t12a + paddd m22, m17 ; t15a + REPX {pmaxsd x, m14}, m20, m23, m18, m19 + REPX {pminsd x, m15}, m20, m23, m18, m19 + REPX {pmulld x, m12}, m20, m23, m18, m19 + psubd m7, m0, m6 ; dct8 out7 + paddd m0, m6 ; dct8 out0 + psubd m6, m1, m5 ; dct8 out6 + paddd m1, m5 ; dct8 out1 + REPX {pmaxsd x, m14}, m7, m0, m6, m1 + psubd m5, m2, m4 ; dct8 out5 + paddd m2, m4 ; dct8 out2 + REPX {pminsd x, m15}, m7, m0, m6, m1 + psubd m4, m3, m8 ; dct8 out4 + paddd m3, m8 ; dct8 out3 + REPX {pmaxsd x, m14}, m5, m2, m4, m3 + paddd m20, m13 + paddd m23, m13 + REPX {pminsd x, m15}, m5, m2, m4, m3 + psubd m17, m20, m18 ; t10a + paddd m20, m18 ; t13a + REPX {pmaxsd x, m14}, m22, m21, m16, m9 + psubd m18, m23, m19 ; t11 + paddd m19, m23 ; t12 + REPX {pminsd x, m15}, m22, m21, m16, m9 + REPX {psrad x, 12 }, m20, m19, m18, m17 + ret +.main_end: + vpbroadcastd m11, [o(pd_2)] +.main_end2: + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + psubd m23, m0, m22 ; out15 + paddd m0, m22 ; out0 + psubd m22, m1, m21 ; out14 + paddd m1, m21 ; out1 + psubd m21, m2, m20 ; out13 + paddd m2, m20 ; out2 + psubd m20, m3, m19 ; out12 + paddd m3, m19 ; out3 + psubd m19, m4, m18 ; out11 + paddd m4, m18 ; out4 + psubd m18, m5, m17 ; out10 + paddd m5, m17 ; out5 + psubd m17, m6, m16 ; out9 + paddd m6, m16 ; out6 + psubd m16, m7, m9 ; out8 + paddd m7, m9 ; out7 + REPX {vpsravd x, m11}, m0, m16, m1, m17, m2, m18, m3, m19, \ + m4, m20, m5, m21, m6, m22, m7, m23 + packssdw m0, m16 + packssdw m1, m17 + packssdw m2, m18 + packssdw m3, m19 + packssdw m4, m20 + packssdw m5, m21 + packssdw m6, m22 + packssdw m7, m23 + ret +.main_end3: + punpckhwd m8, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m4, m5 + punpcklwd m4, m5 + punpcklwd m5, m6, m7 + punpckhwd m6, m7 + punpckhdq m7, m0, m2 + punpckldq m0, m2 + punpckhdq m2, m8, m1 + punpckldq m8, m1 + punpckhdq m1, m4, m5 + punpckldq m4, m5 + punpckhdq m5, m3, m6 + punpckldq m3, m6 + vshufi32x4 m6, m0, m4, q3232 + vinserti32x8 m0, ym4, 1 + vinserti32x8 m4, m8, ym3, 1 + vshufi32x4 m8, m3, q3232 + vinserti32x8 m3, m7, ym1, 1 + vshufi32x4 m7, m1, q3232 + vshufi32x4 m1, m2, m5, q3232 + vinserti32x8 m2, ym5, 1 + vshufi32x4 m5, m7, m1, q2020 ; 10 11 + vshufi32x4 m7, m1, q3131 ; 14 15 + vshufi32x4 m1, m3, m2, q2020 ; 2 3 + vshufi32x4 m3, m2, q3131 ; 6 7 + vshufi32x4 m2, m0, m4, q3131 ; 4 5 + vshufi32x4 m0, m4, q2020 ; 0 1 + vshufi32x4 m4, m6, m8, q2020 ; 8 9 + vshufi32x4 m6, m8, q3131 ; 12 13 + ret +ALIGN function_align +.round: + paddd 
m20, m13 + paddd m22, m13 +.round2: + paddd m16, m13 + paddd m18, m13 +.round3: + REPX {psrad x, 12 }, m16, m18, m20, m22 + REPX {paddd x, m13}, m17, m19, m21, m23 + REPX {psrad x, 12 }, m17, m19, m21, m23 + ret + +INV_TXFM_16X16_FN adst, dct +INV_TXFM_16X16_FN adst, flipadst +INV_TXFM_16X16_FN adst, adst + +cglobal iadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + cmp eobd, 36 + jl .fast + call .main_pass1 + packssdw m0, m16 + packssdw m1, m17 + packssdw m2, m18 + packssdw m3, m19 + packssdw m4, m5, m20 + packssdw m5, m6, m21 + packssdw m6, m7, m22 + packssdw m7, m8, m23 + jmp m(idct_16x16_internal_10bpc).pass1_end +.fast: + call .main_pass1_fast + vpbroadcastd m9, [o(pd_2)] + paddd m0, m9 + psubd m1, m9, m1 + paddd m2, m9 + psubd m3, m9, m3 + paddd m4, m9, m5 + psubd m5, m9, m6 + paddd m6, m9, m7 + psubd m7, m9, m8 +.pass1_fast_end: + mova m9, [o(permA)] + psrlq m8, m9, 8 + REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7 +.pass1_fast_end2: + mova m10, m9 + mova m11, m8 + call m(idct_16x8_internal_10bpc).transpose_16x8 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + REPX {mova [cq+64*x], ym4}, 0, 1, 2, 3, 4, 5, 6, 7 + jmp tx2q +.pass2: + lea r5, [o_base_8bpc] + call m(iadst_16x16_internal_8bpc).main_pass2b + movshdup m12, [permC] + mova m11, [pw_2048_m2048] + psrlq m13, m12, 8 + vpermq m8, m13, m0 + vpermq m0, m12, m7 + vpermq m7, m13, m1 + vpermq m1, m12, m6 + vpermq m6, m13, m2 + vpermq m2, m12, m5 + vpermq m5, m13, m3 + vpermq m3, m12, m4 + jmp m(idct_16x16_internal_10bpc).pass2_end +ALIGN function_align +.main_pass1: + mova m0, [cq+64* 0] +%if WIN64 + movaps [cq+16*0], xmm6 + movaps [cq+16*1], xmm7 +%endif + mova m23, [cq+64*15] + vpbroadcastd m13, [o(pd_2048)] + ITX_MULSUB_2D 23, 0, 8, 9, 10, 13, 201, 4091 ; t1 t0 + mova m7, [cq+64* 7] + mova m16, [cq+64* 8] + ITX_MULSUB_2D 7, 16, 8, 9, 10, 13, 3035, 2751 ; t9 t8 + mova m2, [cq+64* 2] + mova m21, [cq+64*13] + ITX_MULSUB_2D 21, 2, 8, 9, 10, 13, 995, 3973 ; t3 t2 + mova m5, [cq+64* 5] + mova m18, [cq+64*10] + ITX_MULSUB_2D 5, 18, 8, 9, 10, 13, 3513, 2106 ; t11 t10 + mova m4, [cq+64* 4] + mova m19, [cq+64*11] + ITX_MULSUB_2D 19, 4, 8, 9, 10, 13, 1751, 3703 ; t5 t4 + mova m3, [cq+64* 3] + mova m20, [cq+64*12] + ITX_MULSUB_2D 3, 20, 8, 9, 10, 13, 3857, 1380 ; t13 t12 + mova m6, [cq+64* 6] + mova m17, [cq+64* 9] + ITX_MULSUB_2D 17, 6, 8, 9, 10, 13, 2440, 3290 ; t7 t6 + mova m1, [cq+64* 1] + mova m22, [cq+64*14] + ITX_MULSUB_2D 1, 22, 8, 9, 10, 13, 4052, 601 ; t15 t14 + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + psubd m9, m23, m7 ; t9a + paddd m23, m7 ; t1a + psubd m7, m2, m18 ; t10a + paddd m18, m2 ; t2a + REPX {pmaxsd x, m14}, m9, m23, m7, m18 + psubd m2, m17, m1 ; t15a + paddd m17, m1 ; t7a + REPX {pminsd x, m15}, m9, m23, m7, m18 + psubd m1, m21, m5 ; t11a + paddd m21, m5 ; t3a + REPX {pmaxsd x, m14}, m2, m17, m1, m21 + psubd m5, m4, m20 ; t12a + paddd m4, m20 ; t4a + REPX {pminsd x, m15}, m2, m17, m1, m21 + psubd m20, m19, m3 ; t13a + paddd m19, m3 ; t5a + REPX {pmaxsd x, m14}, m5, m4, m20, m19 + psubd m8, m6, m22 ; t14a + paddd m6, m22 ; t6a + REPX {pminsd x, m15}, m5, m4, m20, m19 + psubd m22, m0, m16 ; t8a + paddd m16, m0 ; t0a + REPX {pmaxsd x, m14}, m8, m6, m22, m16 + vpbroadcastd m11, [o(pd_4017)] + vpbroadcastd m10, [o(pd_799)] + REPX {pminsd x, m15}, m8, m6, m22, m16 + ITX_MULSUB_2D 22, 9, 0, 3, _, 13, 10, 11 ; t9 t8 + ITX_MULSUB_2D 20, 5, 0, 3, _, 13, 11, 10 ; t12 t13 + vpbroadcastd m11, [o(pd_2276)] + vpbroadcastd m10, [o(pd_3406)] + ITX_MULSUB_2D 7, 1, 0, 3, _, 
13, 10, 11 ; t11 t10 + ITX_MULSUB_2D 2, 8, 0, 3, _, 13, 11, 10 ; t14 t15 + paddd m0, m16, m4 ; t0 + psubd m16, m4 ; t4 + psubd m3, m23, m19 ; t5 + paddd m23, m19 ; t1 + REPX {pmaxsd x, m14}, m0, m16, m3, m23 + psubd m19, m18, m6 ; t6 + paddd m18, m6 ; t2 + REPX {pminsd x, m15}, m0, m16, m3, m23 + psubd m6, m21, m17 ; t7 + paddd m21, m17 ; t3 + REPX {pmaxsd x, m14}, m19, m18, m6, m21 + paddd m17, m9, m20 ; t8a + psubd m9, m20 ; t12a + REPX {pminsd x, m15}, m19, m18, m6, m21 + psubd m20, m22, m5 ; t13a + paddd m22, m5 ; t9a + REPX {pmaxsd x, m14}, m17, m9, m20, m22 + psubd m5, m1, m2 ; t14a + paddd m1, m2 ; t10a + REPX {pminsd x, m15}, m17, m9, m20, m22 + psubd m2, m7, m8 ; t15a + paddd m7, m8 ; t11a + REPX {pmaxsd x, m14}, m5, m1, m2, m7 + vpbroadcastd m11, [o(pd_3784)] + vpbroadcastd m10, [o(pd_1567)] + REPX {pminsd x, m15}, m5, m1, m2, m7 + ITX_MULSUB_2D 16, 3, 4, 8, _, 13, 10, 11 ; t5a t4a + ITX_MULSUB_2D 6, 19, 4, 8, _, 13, 11, 10 ; t6a t7a + ITX_MULSUB_2D 9, 20, 4, 8, _, 13, 10, 11 ; t13 t12 + ITX_MULSUB_2D 2, 5, 4, 8, _, 13, 11, 10 ; t14 t15 + psubd m8, m0, m18 ; t2a + paddd m0, m18 ; out0 + psubd m18, m23, m21 ; t3a + paddd m23, m21 ; -out15 + paddd m21, m9, m5 ; -out13 + psubd m9, m5 ; t15a + psubd m5, m3, m6 ; t6 + paddd m3, m6 ; -out3 + REPX {pmaxsd x, m14}, m8, m18, m9, m5 + psubd m6, m20, m2 ; t14a + paddd m2, m20 ; out2 + paddd m20, m16, m19 ; out12 + psubd m16, m19 ; t7 + REPX {pminsd x, m15}, m8, m18, m9, m5 + psubd m19, m22, m7 ; t11 + paddd m22, m7 ; out14 + psubd m7, m17, m1 ; t10 + paddd m1, m17 ; -out1 + REPX {pmaxsd x, m14}, m6, m16, m19, m7 + vpbroadcastd m12, [o(pd_1448)] + vpbroadcastd m4, [o(pd_2)] + vpbroadcastd m10, [o(pd_5120)] + vpbroadcastd m11, [o(pd_5119)] + REPX {pminsd x, m15}, m6, m16, m19, m7 + psubd m17, m7, m19 ; -out9 + paddd m7, m19 ; out6 + psubd m19, m5, m16 ; -out11 + paddd m5, m16 ; out4 + REPX {pmulld x, m12}, m17, m7, m19, m5 + psubd m16, m8, m18 ; out8 + paddd m8, m18 ; -out7 + psubd m18, m6, m9 ; out10 + paddd m6, m9 ; -out5 + REPX {pmulld x, m12}, m16, m8, m18, m6 + REPX {paddd x, m4 }, m0, m2, m20, m22 + REPX {psubd x, m4, x}, m1, m3, m21, m23 + REPX {paddd x, m10 }, m7, m5, m16, m18 + REPX {psubd x, m11, x}, m17, m19, m8, m6 + REPX {psrad x, 2 }, m20, m22, m0, m2, m21, m23, m1, m3 + REPX {psrad x, 13}, m17, m19, m5, m7, m16, m18, m6, m8 + ret +ALIGN function_align +.main_pass1_fast: + mova ym0, [cq+64*0] + mova ym1, [cq+64*2] + movshdup m8, [o(permB)] + mova ym6, [cq+64*1] + mova ym7, [cq+64*3] + mova ym2, [cq+64*4] + mova ym3, [cq+64*6] + mova ym4, [cq+64*5] + mova ym5, [cq+64*7] + vpermt2q m0, m8, m1 ; 0 2 + vpermt2q m7, m8, m6 ; 3 1 + vpermt2q m2, m8, m3 ; 4 6 + vpermt2q m5, m8, m4 ; 7 5 + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m12, [o(pd_2896)] + jmp m(iadst_16x8_internal_10bpc).main_fast + +INV_TXFM_16X16_FN flipadst, dct +INV_TXFM_16X16_FN flipadst, adst +INV_TXFM_16X16_FN flipadst, flipadst + +cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + cmp eobd, 36 + jl .fast + call m(iadst_16x16_internal_10bpc).main_pass1 + packssdw m4, m19, m3 + packssdw m3, m20, m5 + packssdw m5, m18, m2 + packssdw m2, m21, m6 + packssdw m6, m17, m1 + packssdw m1, m22, m7 + packssdw m7, m16, m0 + packssdw m0, m23, m8 + jmp m(idct_16x16_internal_10bpc).pass1_end +.fast: + call m(iadst_16x16_internal_10bpc).main_pass1_fast + vpbroadcastd m9, [o(pd_2)] + psubd m4, m9, m3 + paddd m3, m9, m5 + paddd m5, m9, m2 + psubd m2, m9, m6 + psubd m6, m9, m1 + paddd m1, m9, m7 + paddd m7, m9, m0 + psubd m0, m9, m8 + jmp 
m(iadst_16x16_internal_10bpc).pass1_fast_end +.pass2: + lea r5, [o_base_8bpc] + call m(iadst_16x16_internal_8bpc).main_pass2b + movshdup m12, [permC] + movu m11, [pw_m2048_2048] + psrlq m13, m12, 8 + vpermq m8, m13, m7 + vpermq m7, m13, m6 + vpermq m6, m13, m5 + vpermq m5, m13, m4 + vpermq m3, m12, m3 + vpermq m2, m12, m2 + vpermq m1, m12, m1 + vpermq m0, m12, m0 + jmp m(idct_16x16_internal_10bpc).pass2_end + +INV_TXFM_16X16_FN identity, dct, -92 +INV_TXFM_16X16_FN identity, identity + +cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + vpbroadcastd m10, [o(pd_5793)] + vpbroadcastd m11, [o(pd_5120)] + mov r6, cq + cmp eobd, 36 + jl .fast + call .pass1_main + packssdw m0, m6, m8 + packssdw m1, m7, m9 + call .pass1_main + packssdw m2, m6, m8 + packssdw m3, m7, m9 + call .pass1_main + packssdw m4, m6, m8 + packssdw m5, m7, m9 + call .pass1_main + packssdw m6, m8 + packssdw m7, m9 + jmp m(idct_16x16_internal_10bpc).pass1_end2 +.fast: + call .pass1_main_fast + packssdw m0, m6, m7 + call .pass1_main_fast + packssdw m1, m6, m7 + call .pass1_main_fast + packssdw m2, m6, m7 + call .pass1_main_fast + packssdw m3, m6, m7 + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckldq m3, m4, m1 + punpckhdq m4, m1 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + pxor m7, m7 + vshufi32x4 m2, m0, m3, q3131 + vshufi32x4 m0, m3, q2020 + vshufi32x4 m3, m1, m4, q3131 + vshufi32x4 m1, m4, q2020 + REPX {mova x, m7}, m4, m5, m6 + jmp m(idct_16x16_internal_10bpc).pass1_end3 +.pass2: + movshdup m14, [o(permC)] + vpbroadcastd m15, [o(pw_1697x16)] + lea r6, [strideq*3] + vpbroadcastd m11, [o(pw_2048)] + pxor m12, m12 + vpbroadcastd m13, [pixel_10bpc_max] + vpermq m8, m14, m0 + vpermq m9, m14, m1 + call .pass2_main + vpermq m8, m14, m2 + vpermq m9, m14, m3 + call .pass2_main + vpermq m8, m14, m4 + vpermq m9, m14, m5 + call .pass2_main + vpermq m8, m14, m6 + vpermq m9, m14, m7 +.pass2_main: + pmulhrsw m0, m15, m8 + pmulhrsw m1, m15, m9 + paddsw m8, m8 + paddsw m9, m9 + paddsw m8, m0 + paddsw m9, m1 + jmp m(idct_16x8_internal_10bpc).write_16x4 +ALIGN function_align +.pass1_main: + pmulld m6, m10, [r6+64*0] + pmulld m7, m10, [r6+64*1] + pmulld m8, m10, [r6+64*8] + pmulld m9, m10, [r6+64*9] + add r6, 64*2 + REPX {paddd x, m11}, m6, m7, m8, m9 + REPX {psrad x, 13 }, m6, m8, m7, m9 + ret +ALIGN function_align +.pass1_main_fast: + mova ym6, [r6+64* 0] + vinserti32x8 m6, [r6+64* 4], 1 + mova ym7, [r6+64* 8] + vinserti32x8 m7, [r6+64*12], 1 + add r6, 64 + REPX {pmulld x, m10}, m6, m7 + REPX {paddd x, m11}, m6, m7 + REPX {psrad x, 13 }, m6, m7 + ret + +cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 22, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + vpbroadcastd m11, [o(pd_2)] + mova m20, [o(idct8x32p)] + pxor m21, m21 + cmp eobd, 43 + jl .fast + call .pass1_main + punpcklwd m16, m0, m1 + punpcklwd m17, m2, m3 + punpckhwd m18, m0, m1 + punpckhwd m19, m2, m3 + cmp eobd, 107 + jge .full + punpckldq m0, m16, m17 ; 0 2 + punpckhdq m1, m16, m17 ; 4 6 + punpckldq m2, m18, m19 ; 8 10 + punpckhdq m3, m18, m19 ; 12 14 + lea r5, [o_base_8bpc] + vextracti32x8 ym14, m0, 1 + vextracti32x8 ym15, m1, 1 + vextracti32x8 ym16, m2, 1 + vextracti32x8 ym17, m3, 1 + call m(idct_8x16_internal_8bpc).main_fast + call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast + jmp .end +.full: + add cq, 64 + call .pass1_main 
+ punpcklwd m5, m0, m1 + punpcklwd m6, m2, m3 + punpckhwd m7, m0, m1 + punpckhwd m8, m2, m3 + punpckldq m0, m16, m17 ; 0 2 + punpckhdq m1, m16, m17 ; 4 6 + punpckldq m2, m18, m19 ; 8 10 + punpckhdq m3, m18, m19 ; 12 14 + punpckldq m4, m5, m6 ; 16 18 + punpckhdq m5, m6 ; 20 22 + punpckldq m6, m7, m8 ; 24 26 + punpckhdq m7, m8 ; 28 30 + lea r5, [o_base_8bpc] + vextracti32x8 ym14, m0, 1 + vextracti32x8 ym15, m1, 1 + vextracti32x8 ym16, m2, 1 + vextracti32x8 ym17, m3, 1 + vextracti32x8 ym18, m4, 1 + vextracti32x8 ym19, m5, 1 + vextracti32x8 ym20, m6, 1 + vextracti32x8 ym21, m7, 1 + call m(idct_8x16_internal_8bpc).main + REPX {pshufd x, x, q1032}, ym18, ym19, ym20, ym21 + call m(inv_txfm_add_dct_dct_8x32_8bpc).main + jmp .end +.fast: + movshdup m8, [o(permB)] + mova ym1, [cq+128*1] + mova ym5, [cq+128*5] + mova ym7, [cq+128*3] + mova ym3, [cq+128*7] + mova ym0, [cq+128*0] + mova ym4, [cq+128*2] + mova ym2, [cq+128*4] + mova ym6, [cq+128*6] + vpermt2q m1, m8, m5 ; 1 5 + vpermt2q m3, m8, m7 ; 7 3 + vpermt2q m0, m8, m4 ; 0 2 + vpermt2q m2, m8, m6 ; 4 6 + mova [cq+128*0], ym21 + REPX {vmovdqa32 [cq+128*x], ym21}, 1, 2, 3, 4, 5, 6, 7 + call m(idct_8x8_internal_10bpc).main + call m(idct_8x8_internal_10bpc).main_end + packssdw m0, m2 + packssdw m1, m3 + vpermb m0, m20, m0 + vprold m20, 16 + vpermb m2, m20, m1 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + lea r5, [o_base_8bpc] + vextracti32x8 ym14, m0, 1 + vextracti32x8 ym15, m1, 1 + call m(idct_8x16_internal_8bpc).main_fast2 + call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast2 +.end: + call m(inv_txfm_add_dct_dct_8x32_8bpc).main_end ; performs vzeroupper + lea r3, [strideq*2] + vpbroadcastd m12, [pixel_10bpc_max] + lea r6, [strideq*3] + pxor m11, m11 + lea r3, [dstq+r3*8] + pmulhrsw m0, m10 + pmulhrsw m1, m10 + call .write_8x4x2 + pmulhrsw m0, m10, m2 + pmulhrsw m1, m10, m3 + call .write_8x4x2 + pmulhrsw m0, m10, m4 + pmulhrsw m1, m10, m5 + call .write_8x4x2 + pmulhrsw m0, m10, m6 + pmulhrsw m1, m10, m7 +.write_8x4x2: + mova xm8, [dstq+strideq*0] + vinserti32x4 ym8, [dstq+strideq*1], 1 + vinserti32x4 m8, [dstq+strideq*2], 2 + vinserti32x4 m8, [dstq+r6 ], 3 + mova xm9, [r3 +r6 ] + vinserti32x4 ym9, [r3 +strideq*2], 1 + vinserti32x4 m9, [r3 +strideq*1], 2 + vinserti32x4 m9, [r3 +strideq*0], 3 + paddw m8, m0 + paddw m9, m1 + pmaxsw m8, m11 + pmaxsw m9, m11 + pminsw m8, m12 + pminsw m9, m12 + mova [dstq+strideq*0], xm8 + vextracti32x4 [dstq+strideq*1], ym8, 1 + vextracti32x4 [dstq+strideq*2], m8, 2 + vextracti32x4 [dstq+r6 ], m8, 3 + lea dstq, [dstq+strideq*4] + vextracti32x4 [r3 +strideq*0], m9, 3 + vextracti32x4 [r3 +strideq*1], m9, 2 + vextracti32x4 [r3 +strideq*2], ym9, 1 + mova [r3 +r6 ], xm9 + lea r3, [r3+strideq*4] + ret +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd + or r3d, 32 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2 +ALIGN function_align +.pass1_main: + mova m0, [cq+128*0] + mova m1, [cq+128*1] + mova m2, [cq+128*2] + mova m3, [cq+128*3] + mova m4, [cq+128*4] + mova m5, [cq+128*5] + mova m6, [cq+128*6] + mova m7, [cq+128*7] + REPX {mova [cq+128*x], m21}, 0, 1, 2, 3, 4, 5, 6, 7 + call m(idct_8x16_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_end2 + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + REPX {vpermb x, m20, x}, m0, m1, m2, m3 + ret + +cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 8, 12, dst, stride, c, eob + vpbroadcastd m9, [pw_5] + lea r4, [strideq*3] + pxor m10, m10 + lea r5, [strideq*5] + vpbroadcastd m11, [pixel_10bpc_max] + sub eobd, 107 + lea r6, 
[strideq+r4*2] +.loop: + mova m0, [cq+128*0] + packssdw m0, [cq+128*1] + mova m1, [cq+128*2] + packssdw m1, [cq+128*3] + mova m2, [cq+128*4] + packssdw m2, [cq+128*5] + mova m3, [cq+128*6] + packssdw m3, [cq+128*7] + lea r7, [dstq+strideq*8] + REPX {mova [cq+128*x], m10}, 0, 1, 2, 3 + REPX {paddsw x, m9}, m0, m1, m2, m3 + REPX {mova [cq+128*x], m10}, 4, 5, 6, 7 + REPX {psraw x, 3 }, m0, m1, m2, m3 + add cq, 64 + mova xm4, [dstq+strideq*0] + mova xm5, [dstq+strideq*1] + mova xm6, [dstq+strideq*2] + mova xm7, [dstq+r4 *1] + punpckhwd m8, m0, m1 + vinserti32x4 ym4, [dstq+strideq*4], 1 + punpcklwd m0, m1 + vinserti32x4 ym5, [dstq+r5 *1], 1 + punpckhwd m1, m2, m3 + vinserti32x4 ym6, [dstq+r4 *2], 1 + punpcklwd m2, m3 + vinserti32x4 ym7, [dstq+r6 *1], 1 + punpckhwd m3, m0, m8 + vinserti32x4 m4, [r7 +strideq*0], 2 + punpcklwd m0, m8 + vinserti32x4 m5, [r7 +strideq*1], 2 + punpckhwd m8, m2, m1 + vinserti32x4 m6, [r7 +strideq*2], 2 + punpcklwd m2, m1 + vinserti32x4 m7, [r7 +r4 *1], 2 + punpckhqdq m1, m0, m2 + vinserti32x4 m4, [r7 +strideq*4], 3 + punpcklqdq m0, m2 + vinserti32x4 m5, [r7 +r5 *1], 3 + punpcklqdq m2, m3, m8 + vinserti32x4 m6, [r7 +r4 *2], 3 + punpckhqdq m3, m8 + vinserti32x4 m7, [r7 +r6 *1], 3 + paddw m0, m4 + paddw m1, m5 + paddw m2, m6 + paddw m3, m7 + REPX {pmaxsw x, m10}, m0, m1, m2, m3 + REPX {pminsw x, m11}, m0, m1, m2, m3 + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm1 + mova [dstq+strideq*2], xm2 + mova [dstq+r4 *1], xm3 + vextracti32x4 [dstq+strideq*4], ym0, 1 + vextracti32x4 [dstq+r5 *1], ym1, 1 + vextracti32x4 [dstq+r4 *2], ym2, 1 + vextracti32x4 [dstq+r6 *1], ym3, 1 + lea dstq, [r7+strideq*8] + vextracti32x4 [r7 +strideq*0], m0, 2 + vextracti32x4 [r7 +strideq*1], m1, 2 + vextracti32x4 [r7 +strideq*2], m2, 2 + vextracti32x4 [r7 +r4 *1], m3, 2 + vextracti32x4 [r7 +strideq*4], m0, 3 + vextracti32x4 [r7 +r5 *1], m1, 3 + vextracti32x4 [r7 +r4 *2], m2, 3 + vextracti32x4 [r7 +r6 *1], m3, 3 + add eobd, 0x80000000 + jnc .loop + RET + +cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + mova m11, [o(permB)] + mova m0, [cq+64* 0] ; 0 1 + mova m4, [cq+64* 1] ; 2 3 + mova m1, [cq+64* 2] ; 4 5 + mova m8, [cq+64* 3] ; 6 7 + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + psrlq m10, m11, 32 +%if WIN64 + movaps [cq+16*0], xmm6 + movaps [cq+16*1], xmm7 +%endif + mova m16, m11 + vpermi2q m16, m0, m1 ; 1 5 + mova m17, m11 + vpermi2q m17, m8, m4 ; 7 3 + cmp eobd, 43 + jl .fast + mova m18, [cq+64* 4] ; 8 9 + mova m20, [cq+64* 5] ; 10 11 + mova m6, [cq+64* 6] ; 12 13 + mova m7, [cq+64* 7] ; 14 15 + vpermt2q m0, m10, m18 ; 0 8 + vpermt2q m18, m11, m6 ; 9 13 + mova m19, m11 + vpermi2q m19, m7, m20 ; 15 11 + cmp eobd, 107 + jge .full + vpermt2q m1, m10, m6 ; 4 12 + vpermt2q m4, m10, m8 ; 2 6 + vpermt2q m7, m10, m20 ; 14 10 + mov r6d, 64*1 + call m(idct_8x8_internal_10bpc).main_fast + call m(idct_16x8_internal_10bpc).main_fast + call .main_fast + call m(idct_16x16_internal_10bpc).main_end + jmp .end +.full: + mova m2, [cq+64* 8] ; 16 17 + mova m5, [cq+64* 9] ; 18 19 + mova m9, [cq+64*10] ; 20 21 + mova m21, [cq+64*11] ; 22 23 + vpermt2q m1, m10, m9 ; 4 20 + vpermt2q m7, m10, m21 ; 14 22 + vpermt2q m21, m11, m5 ; 23 19 + vpermt2q m5, m10, m20 ; 18 10 + mova m20, m11 + vpermi2q m20, m2, m9 ; 17 21 + mova m22, [cq+64*12] ; 24 25 + mova m9, [cq+64*13] ; 26 27 + mova m3, [cq+64*14] ; 28 29 + mova m23, [cq+64*15] ; 30 31 + 
vpermt2q m2, m10, m22 ; 16 24 + vpermt2q m22, m11, m3 ; 25 29 + vpermt2q m3, m10, m6 ; 28 12 + vpermt2q m4, m10, m9 ; 2 26 + mova m6, m10 + vpermi2q m6, m23, m8 ; 30 6 + vpermt2q m23, m11, m9 ; 31 27 + mov r6d, 64*3 + call m(idct_8x8_internal_10bpc).main + call m(idct_16x8_internal_10bpc).main + call .main + call m(idct_16x16_internal_10bpc).main_end + jmp .end +.fast: + vpermq m0, m10, m0 ; 0 0 + vpermq m1, m10, m1 ; 4 4 + vpermt2q m4, m10, m8 ; 2 6 + xor r6d, r6d + call .main_fast2 + call m(idct_16x16_internal_10bpc).main_end +.end: +%if WIN64 + movaps xmm6, [cq+16*0] + movaps xmm7, [cq+16*1] +%endif + vzeroupper + call .transpose_8x32 + pxor m14, m14 +.zero_loop: + mova [cq+r6*4+64*3], m14 + mova [cq+r6*4+64*2], m14 + mova [cq+r6*4+64*1], m14 + mova [cq+r6*4+64*0], m14 + sub r6d, 64 + jge .zero_loop + lea r5, [o_base_8bpc] + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m7 + punpckhqdq m5, m7 + punpckhqdq m7, m6, m8 + punpcklqdq m6, m8 + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + pxor m12, m12 +.write_32x8_start: + vpbroadcastd m11, [pw_2048] + vpbroadcastd m13, [pixel_10bpc_max] + lea r3, [strideq*3] +.write_32x8: + pmulhrsw m0, m11 + pmulhrsw m1, m11 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + call .write_32x4 + pmulhrsw m0, m11, m4 + pmulhrsw m1, m11, m5 + pmulhrsw m2, m11, m6 + pmulhrsw m3, m11, m7 +.write_32x4: + paddw m0, [dstq+strideq*0] + paddw m1, [dstq+strideq*1] + paddw m2, [dstq+strideq*2] + paddw m3, [dstq+r3 ] + REPX {pmaxsw x, m12}, m0, m1, m2, m3 + REPX {pminsw x, m13}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r3 ], m3 + lea dstq, [dstq+strideq*4] + ret +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd + or r3d, 8 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly2 +ALIGN function_align +.main_fast2: ; bottom three-quarters are zero + vbroadcasti32x4 m8, [o(pd_799_4017)] + pmulld m8, m1 ; t4 t7 + vpmulld m0, [o(pd_2896)] {1to16} ; t0 t1 + REPX {paddd x, m13}, m8, m0 + REPX {psrad x, 12 }, m8, m0 + pmulld m3, m8, m12 + mova m2, m0 ; t3 t2 + call m(idct_8x8_internal_10bpc).main3 + vbroadcasti32x4 m6, [o(pd_4076_3920)] + vbroadcasti32x4 m3, [o(pd_401_m1189)] + pmulld m6, m4 ; t15 t12 + pmulld m4, m3 ; t9 t10 + REPX {paddd x, m13}, m6, m4 + REPX {psrad x, 12 }, m6, m4 + mova m5, m6 ; t14 t13 + mova m9, m4 ; t8 t11 + call m(idct_16x8_internal_10bpc).main3 + vbroadcasti32x4 m23, [o(pd_4091_3973)] + vbroadcasti32x4 m7, [o(pd_201_995)] + vbroadcasti32x4 m22, [o(pd_1380_601)] + vbroadcasti32x4 m9, [o(pd_3857_4052)] + pmulld m23, m16 ; t16 t20 + pmulld m16, m7 ; t31 t27 + pmulld m22, m17 ; -t19 -t25 + pmulld m17, m9 ; t28 t24 + REPX {paddd x, m13}, m23, m16, m17 + psubd m22, m13, m22 + REPX {psrad x, 12 }, m23, m16, m22, m17 + mova m20, m23 ; t30 t26 + mova m9, m16 ; t17 t21 + mova m19, m22 ; t18 t22 + mova m18, m17 ; t29 t25 + jmp .main3 +.main_fast: ; bottom half is zero + vbroadcasti32x4 m23, [o(pd_4091_3973)] + vbroadcasti32x4 m7, [o(pd_201_995)] + vbroadcasti32x4 m20, [o(pd_2751_2106)] + vbroadcasti32x4 m9, [o(pd_3035_3513)] + vbroadcasti32x4 m21, [o(pd_3703_3290)] + vbroadcasti32x4 m10, [o(pd_1751_2440)] + vbroadcasti32x4 m22, [o(pd_1380_601)] + vbroadcasti32x4 m11, [o(pd_3857_4052)] + pmulld m23, m16 ; t16a t20a + pmulld m16, m7 ; t31a t27a + pmulld m20, m19 ; -t17a -t21a + pmulld m19, m9 ; t30a t26a + pmulld m21, m18 ; t18a t22a + pmulld m18, m10 ; t29a t25a + pmulld m22, m17 ; -t19a -t25a + pmulld m17, m11 ; t28a t24a 
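+ ; (With the bottom half of the input zero, each ITX_MULSUB_2D rotation
+ ; of .main degenerates: dst1 = src1*coef1 - src2*coef2 and
+ ; dst2 = src1*coef2 + src2*coef1 collapse to a single multiply per
+ ; output when src2 == 0, hence the plain pmulld block above, rounded
+ ; later with m13 = pd_2048 and psrad 12. The psubd-from-m13 fixups
+ ; below negate, with rounding, the products annotated with a leading
+ ; "-" (-t17a, -t19a, ...). Sketch inferred from the generic
+ ; ITX_MULSUB_2D definition, not an upstream annotation.)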
+ psubd m20, m13, m20 + psubd m22, m13, m22 + jmp .main2 +.main: + ITX_MULSUB_2D 16, 23, 7, 9, 10, _, 201_995, 4091_3973 + ITX_MULSUB_2D 20, 19, 7, 9, 10, _, 3035_3513, 2751_2106 + ITX_MULSUB_2D 18, 21, 7, 9, 10, _, 1751_2440, 3703_3290 + ITX_MULSUB_2D 22, 17, 7, 9, 10, _, 3857_4052, 1380_601 + paddd m20, m13 + paddd m22, m13 +.main2: + REPX {paddd x, m13}, m16, m23, m19 + REPX {psrad x, 12 }, m16, m20, m23, m19 + psubd m9, m16, m20 ; t17 t21 + paddd m16, m20 ; t16 t20 + psubd m20, m23, m19 ; t30 t26 + paddd m23, m19 ; t31 t27 + REPX {pmaxsd x, m14}, m9, m16, m20, m23 + REPX {paddd x, m13}, m21, m18, m17 + REPX {psrad x, 12 }, m18, m22, m21, m17 + psubd m19, m22, m18 ; t18 t22 + paddd m22, m18 ; t19 t23 + psubd m18, m17, m21 ; t29 t25 + paddd m17, m21 ; t28 t24 + REPX {pmaxsd x, m14}, m19, m22, m18, m17 + REPX {pminsd x, m15}, m20, m9, m18, m19, m16, m23, m22, m17 +.main3: + vbroadcasti32x4 m11, [o(pd_4017_2276)] + vbroadcasti32x4 m10, [o(pd_799_3406)] + psubd m7, m0, m6 ; dct16 out15 out14 + paddd m0, m6 ; dct16 out0 out1 + psubd m6, m1, m5 ; dct16 out12 out13 + paddd m1, m5 ; dct16 out3 out2 + psubd m5, m2, m4 ; dct16 out11 out10 + paddd m2, m4 ; dct16 out4 out5 + psubd m4, m3, m8 ; dct16 out8 out9 + paddd m3, m8 ; dct16 out7 out6 + ITX_MULSUB_2D 20, 9, 8, 21, _, 13, 10, 11 + ITX_MULSUB_2D 18, 19, 8, 21, _, 13, 10, 11, 2 + REPX {pmaxsd x, m14}, m7, m0, m6, m1, m5, m2, m4, m3 + punpckhqdq m21, m16, m20 ; t20 t21a + punpcklqdq m16, m20 ; t16 t17a + punpcklqdq m20, m22, m19 ; t19 t18a + punpckhqdq m22, m19 ; t23 t22a + REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7 + punpcklqdq m19, m23, m9 ; t31 t30a + punpckhqdq m23, m9 ; t27 t26a + punpckhqdq m9, m17, m18 ; t24 t25a + punpcklqdq m17, m18 ; t28 t29a + vpbroadcastd m11, [o(pd_3784)] + vpbroadcastd m10, [o(pd_1567)] + psubd m18, m16, m20 ; t19a t18 + paddd m20, m16 ; t16a t17 + psubd m16, m19, m17 ; t28a t29 + paddd m19, m17 ; t31a t30 + psubd m17, m22, m21 ; t20a t21 + paddd m22, m21 ; t23a t22 + psubd m21, m9, m23 ; t27a t26 + paddd m23, m9 ; t24a t25 + REPX {pmaxsd x, m14}, m18, m16, m17, m21 + REPX {pminsd x, m15}, m16, m18, m21, m17 + ITX_MULSUB_2D 16, 18, 8, 9, _, 13, 10, 11 + ITX_MULSUB_2D 21, 17, 8, 9, _, 13, 10, 11, 2 + REPX {pmaxsd x, m14}, m20, m22, m19, m23 + REPX {pminsd x, m15}, m20, m22, m19, m23 + paddd m9, m20, m22 ; t16 t17a + psubd m20, m22 ; t23 t22a + paddd m22, m19, m23 ; t31 t30a + psubd m19, m23 ; t24 t25a + psubd m23, m16, m17 ; t20a t21 + paddd m16, m17 ; t19a t18 + psubd m17, m18, m21 ; t27a t26 + paddd m21, m18 ; t28a t29 + REPX {pmaxsd x, m14}, m20, m19, m23, m17 + REPX {pminsd x, m15}, m19, m20, m17, m23 + REPX {pmulld x, m12}, m19, m20, m17, m23 + REPX {pmaxsd x, m14}, m22, m21, m16, m9 + paddd m19, m13 + paddd m17, m13 + REPX {pminsd x, m15}, m22, m21, m16, m9 + psubd m18, m19, m20 ; t23a t22 + paddd m19, m20 ; t24a t25 + paddd m20, m17, m23 ; t27 t26a + psubd m17, m23 ; t20 t21a + REPX {psrad x, 12 }, m20, m19, m18, m17 + ret +.transpose_8x32: + mova m10, [o(idct32x8p)] + psrlw m8, m10, 8 + mova m9, m8 + vpermi2w m8, m1, m5 + vpermt2w m1, m10, m5 + vprold m5, m9, 16 + vpermi2w m9, m3, m7 + vpermt2w m3, m10, m7 + vprold m10, 16 + mova m7, m5 + vpermi2w m5, m0, m4 + vpermt2w m0, m10, m4 + vpermi2w m7, m2, m6 + vpermt2w m2, m10, m6 + punpckhdq m6, m5, m8 + punpckldq m5, m8 + punpckhdq m8, m7, m9 + punpckldq m7, m9 + punpckhdq m4, m2, m3 + punpckldq m2, m3 + punpckhdq m3, m0, m1 + punpckldq m0, m1 + ret + +cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 10, dst, stride, c, eob + vpbroadcastd m5, 
[pw_4096] + lea r4, [strideq*3] + mova m6, [idtx32x8p] + lea r5, [strideq*5] + vpbroadcastd m9, [pixel_10bpc_max] + lea r6, [strideq+r4*2] + pxor m8, m8 + sub eobd, 107 + psrlw m7, m6, 8 +.loop: + mova m0, [cq+64*0] + packssdw m0, [cq+64*1] ; 02 13 + mova m1, [cq+64*2] + packssdw m1, [cq+64*3] ; 46 57 + mova m2, [cq+64*4] + packssdw m2, [cq+64*5] ; 8a 9b + mova m3, [cq+64*6] + packssdw m3, [cq+64*7] ; ce df + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + REPX {mova [cq+64*x], m8}, 0, 1, 2, 3 + mova m4, m6 + vpermi2w m4, m1, m3 + vpermt2w m1, m7, m3 + REPX {mova [cq+64*x], m8}, 4, 5, 6, 7 + mova m3, m7 + vpermi2w m3, m0, m2 + vpermt2w m0, m6, m2 + add cq, 64*8 + punpcklqdq m2, m3, m1 ; 4 5 + punpckhqdq m3, m1 ; 6 7 + punpckhqdq m1, m0, m4 ; 2 3 + punpcklqdq m0, m4 ; 0 1 + mova ym4, [dstq+strideq*0] + vinserti32x8 m4, [dstq+strideq*1], 1 + paddw m0, m4 + mova ym4, [dstq+strideq*2] + vinserti32x8 m4, [dstq+r4 *1], 1 + paddw m1, m4 + mova ym4, [dstq+strideq*4] + vinserti32x8 m4, [dstq+r5 *1], 1 + paddw m2, m4 + mova ym4, [dstq+r4 *2] + vinserti32x8 m4, [dstq+r6 *1], 1 + paddw m3, m4 + REPX {pmaxsw x, m8}, m0, m1, m2, m3 + REPX {pminsw x, m9}, m0, m1, m2, m3 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+r4 *1], m1, 1 + mova [dstq+strideq*4], ym2 + vextracti32x8 [dstq+r5 *1], m2, 1 + mova [dstq+r4 *2], ym3 + vextracti32x8 [dstq+r6 *1], m3, 1 + add dstq, 32 + add eobd, 0x80000000 + jnc .loop + RET + +cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] +%if WIN64 + movaps [rsp+ 8], xmm6 + movaps [rsp+24], xmm7 +%endif + cmp eobd, 36 + jl .fast + call .pass1 + cmp eobd, 151 + jge .full + lea r5, [o_base_8bpc] + pxor m9, m9 + punpcklwd m8, m1, m1 ; 2 + punpckhwd m14, m1, m1 ; 3 + punpcklwd m1, m3, m3 ; 6 + punpckhwd m15, m3, m3 ; 7 + punpcklwd m3, m6, m6 ; 12 + punpckhwd m19, m6, m6 ; 13 + punpcklwd m6, m9, m4 ; __ 8 + punpckhwd m20, m4, m4 ; 9 + punpckhwd m16, m5, m5 ; 11 + punpcklwd m5, m5 ; 10 + punpcklwd m9, m0 ; __ 0 + punpckhwd m21, m0, m0 ; 1 + punpcklwd m0, m7, m7 ; 14 + punpckhwd m17, m7, m7 ; 15 + punpcklwd m7, m2, m2 ; 4 + punpckhwd m18, m2, m2 ; 5 + call m(idct_16x16_internal_8bpc).main_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + mov r6d, 64*3 + pxor m8, m8 +.zero_loop: + REPX {mova [cq+r6*8+128*x], m8}, 3, 2, 1, 0 + sub r6d, 64 + jge .zero_loop + jmp .pass2_end +.full: + mova [cq+128*0], m0 + mova [cq+128*1], m1 + mova [cq+128*2], m2 + mova [cq+128*3], m3 + mova [cq+128*4], m4 + mova [cq+128*5], m5 + mova [cq+128*6], m6 + mova [cq+128*7], m7 + add cq, 64 + call .pass1 + mova m9, [cq-64* 1] ; 0 1 + mova m14, [cq+64* 1] ; 2 3 + mova m18, [cq+64* 3] ; 4 5 + mova m15, [cq+64* 5] ; 6 7 + mova m20, [cq+64* 7] ; 8 9 + mova m16, [cq+64* 9] ; 10 11 + mova m22, [cq+64*11] ; 12 13 + mova m19, [cq+64*13] ; 14 15 + lea r5, [o_base_8bpc] + punpcklwd m8, m7, m14 ; 30 2 + punpckhwd m21, m7, m9 ; 31 1 + punpcklwd m7, m6, m18 ; 28 4 + punpckhwd m14, m6 ; 3 29 + punpcklwd m9, m0, m9 ; 16 0 + punpckhwd m17, m19, m0 ; 15 17 + punpcklwd m0, m19, m1 ; 14 18 + punpckhwd m19, m1, m22 ; 19 13 + punpcklwd m1, m15, m5 ; 6 26 + punpckhwd m18, m5, m18 ; 27 5 + punpcklwd m6, m4, m20 ; 24 8 + punpckhwd m15, m4 ; 7 25 + punpcklwd m5, m3, m16 ; 22 10 + punpckhwd m20, m3, m20 ; 23 9 + punpcklwd m3, m22, m2 ; 12 
20 + punpckhwd m16, m2 ; 11 21 + call m(idct_16x16_internal_8bpc).main2 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf + mov r6d, 32*7 + pxor m8, m8 +.full_zero_loop: + REPX {mova [cq+r6*8+64*x], m8}, 2, 1, 0, -1 + sub r6d, 32 + jge .full_zero_loop + jmp .pass2_end +.fast: + mova ym0, [cq+128*0] + mova ym2, [cq+128*4] + movshdup m8, [o(permB)] + mova ym1, [cq+128*2] + mova ym3, [cq+128*6] + mova ym4, [cq+128*1] + mova ym5, [cq+128*3] + mova ym6, [cq+128*5] + mova ym7, [cq+128*7] + vpermt2q m0, m8, m2 ; 0 4 + vpermt2q m1, m8, m3 ; 2 6 + vpermt2q m4, m8, m5 ; 1 3 + vpermt2q m7, m8, m6 ; 7 5 + REPX {pmulld x, m12}, m0, m1, m4, m7 + pxor ym16, ym16 + mova [cq+128*0], ym16 + REPX {vmovdqa32 [cq+128*x], ym16}, 1, 2, 3, 4, 5, 6, 7 + REPX {paddd x, m13}, m0, m1, m4, m7 + REPX {psrad x, 12 }, m0, m1, m4, m7 + call m(idct_8x8_internal_10bpc).main_fast + call m(idct_16x8_internal_10bpc).main_fast + vpbroadcastd m11, [o(pd_1)] + call m(idct_8x16_internal_10bpc).main_end2 + mova m8, [o(idct8x32p)] + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + mova m6, [dup16_perm] + vpermb m0, m8, m0 + vpermb m2, m8, m2 + vprold m8, 16 + vpermb m1, m8, m1 + vpermb m3, m8, m3 + punpckldq m4, m0, m2 + punpckhdq m0, m2 + punpckldq m2, m1, m3 + punpckhdq m1, m3 + punpckldq m21, m4, m2 + punpckhdq m14, m4, m2 + punpckldq m18, m0, m1 + punpckhdq m15, m0, m1 + vpermb m8, m6, m14 ; 2 + vpermb m1, m6, m15 ; 6 + vpermb m7, m6, m18 ; 4 + pmovzxwd m9, ym21 ; 0 + vpord m6, [o(pb_32)] {1to16} + lea r5, [o_base_8bpc] + vpermb m21, m6, m21 ; 1 + vpermb m15, m6, m15 ; 7 + vpermb m18, m6, m18 ; 5 + vpermb m14, m6, m14 ; 3 + pslld m9, 16 + call m(idct_16x16_internal_8bpc).main_fast2 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 +.pass2_end: + movshdup m22, [permC] + vpbroadcastd m11, [pw_2048] + vpbroadcastd m13, [pixel_10bpc_max] + lea r6, [strideq*3] + pxor m12, m12 + psrlq m23, m22, 8 + vpermq m8, m22, m0 + vpermq m9, m23, m1 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m22, m2 + vpermq m9, m23, m3 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m22, m4 + vpermq m9, m23, m5 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m22, m6 + vpermq m9, m23, m7 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m22, m14 + vpermq m9, m23, m15 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m22, m16 + vpermq m9, m23, m17 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m22, m18 + vpermq m9, m23, m19 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m22, m20 + vpermq m9, m23, m21 +%if WIN64 + movaps xmm6, [rsp+ 8] + movaps xmm7, [rsp+24] +%endif + vzeroupper + jmp m(idct_16x8_internal_10bpc).write_16x4 +.pass1: + pmulld m0, m12, [cq+128* 0] + pmulld m1, m12, [cq+128* 2] + pmulld m2, m12, [cq+128* 4] + pmulld m3, m12, [cq+128* 6] + pmulld m4, m12, [cq+128* 8] + pmulld m5, m12, [cq+128*10] + pmulld m6, m12, [cq+128*12] + pmulld m7, m12, [cq+128*14] + call m(idct_8x16_internal_10bpc).main_rect2 + pmulld m16, m12, [cq+128* 1] + pmulld m17, m12, [cq+128* 3] + pmulld m18, m12, [cq+128* 5] + pmulld m19, m12, [cq+128* 7] + pmulld m20, m12, [cq+128* 9] + pmulld m21, m12, [cq+128*11] + pmulld m22, m12, [cq+128*13] + pmulld m23, m12, [cq+128*15] + call m(idct_16x16_internal_10bpc).main_rect2 + vpbroadcastd m11, [o(pd_1)] + call m(idct_16x16_internal_10bpc).main_end2 + jmp m(idct_16x16_internal_10bpc).main_end3 +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd + or r3d, 32 + jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly + +cglobal 
inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 16, dst, stride, c, eob +%undef cmp + vpbroadcastd m10, [pw_2896x8] + vpbroadcastd m11, [pw_1697x16] + vpbroadcastd m13, [pw_8192] + vpbroadcastd m15, [pixel_10bpc_max] + lea r6, [strideq*9] + pxor m14, m14 + paddw m12, m13, m13 ; pw_16384 + cmp eobd, 151 + jl .main + call .main + add cq, 64-128*4 + lea dstq, [dstq+strideq*8] +.main: + call .main_internal + add cq, 128*4 + pmulhrsw m1, m13, m2 + pmulhrsw m3, m13, m4 + pmulhrsw m5, m13, m6 + pmulhrsw m7, m13, m8 + call .main_internal +.main2: + pmulhrsw m2, m13 + pmulhrsw m4, m13 + pmulhrsw m6, m13 + pmulhrsw m8, m13 + punpcklqdq m0, m1, m2 ; 0 8 + punpckhqdq m1, m2 ; 1 9 + call .write_16x2x2 + punpcklqdq m0, m3, m4 ; 2 10 + punpckhqdq m1, m3, m4 ; 3 11 + call .write_16x2x2 + punpcklqdq m0, m5, m6 ; 4 12 + punpckhqdq m1, m5, m6 ; 5 13 + call .write_16x2x2 + punpcklqdq m0, m7, m8 ; 6 14 + punpckhqdq m1, m7, m8 ; 7 15 +.write_16x2x2: + mova ym2, [dstq+strideq*0] + vinserti32x8 m2, [dstq+strideq*8], 1 + mova ym9, [dstq+strideq*1] + vinserti32x8 m9, [dstq+r6 ], 1 + paddw m0, m2 + paddw m1, m9 + pmaxsw m0, m14 + pmaxsw m1, m14 + pminsw m0, m15 + pminsw m1, m15 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*8], m0, 1 + mova [dstq+strideq*1], ym1 + vextracti32x8 [dstq+r6 ], m1, 1 + lea dstq, [dstq+strideq*2] + ret +.main_internal: + mova m8, [cq+128* 0] + packssdw m8, [cq+128* 8] + mova m6, [cq+128* 1] + packssdw m6, [cq+128* 9] + mova m0, [cq+128* 2] + packssdw m0, [cq+128*10] + mova m2, [cq+128* 3] + packssdw m2, [cq+128*11] + REPX {pmulhrsw x, m10}, m8, m6, m0, m2 + REPX {vpermq x, x, q3120}, m8, m6, m0, m2 + pmulhrsw m4, m11, m8 + pmulhrsw m9, m11, m6 + REPX {mova [cq+128*x], m14}, 0, 1, 2, 3 + pmulhrsw m4, m12 + pmulhrsw m9, m12 + paddsw m8, m4 + paddsw m6, m9 + pmulhrsw m4, m11, m0 + pmulhrsw m9, m11, m2 + REPX {mova [cq+128*x], m14}, 8, 9, 10, 11 + pmulhrsw m4, m12 + pmulhrsw m9, m12 + paddsw m0, m4 + paddsw m2, m9 + punpcklwd m4, m8, m6 + punpckhwd m8, m6 + punpcklwd m6, m0, m2 + punpckhwd m0, m2 + punpckldq m2, m4, m6 ; 0 1 + punpckhdq m4, m6 ; 2 3 + punpckldq m6, m8, m0 ; 4 5 + punpckhdq m8, m0 ; 6 7 + ret + +cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] +%if WIN64 + movaps [rsp+ 8], xmm6 + movaps [rsp+24], xmm7 +%endif + mov r6d, 8*12 + cmp eobd, 36 + jl .fast + pmulld m0, m12, [cq+64* 0] + pmulld m1, m12, [cq+64* 4] + pmulld m2, m12, [cq+64* 8] + pmulld m3, m12, [cq+64*12] + pmulld m16, m12, [cq+64* 2] + pmulld m17, m12, [cq+64* 6] + pmulld m18, m12, [cq+64*10] + pmulld m19, m12, [cq+64*14] + cmp eobd, 151 + jge .full + call m(idct_8x16_internal_10bpc).main_fast_rect2 + call m(idct_16x16_internal_10bpc).main_fast_rect2 + call .idct16_sumsub + call .pass1_load_spill + call .main_fast_rect2 + jmp .pass1_end +.full: + pmulld m4, m12, [cq+64*16] + pmulld m5, m12, [cq+64*20] + pmulld m6, m12, [cq+64*24] + pmulld m7, m12, [cq+64*28] + pmulld m20, m12, [cq+64*18] + pmulld m21, m12, [cq+64*22] + pmulld m22, m12, [cq+64*26] + pmulld m23, m12, [cq+64*30] + add r6d, 8*16 + call m(idct_8x16_internal_10bpc).main_rect2 + call m(idct_16x16_internal_10bpc).main_rect2 + call .idct16_sumsub + call .pass1_load_spill + pmulld m16, m12, [cq+64*17] + pmulld m17, m12, [cq+64*19] + pmulld m18, m12, [cq+64*21] + pmulld m19, m12, [cq+64*23] + pmulld m20, m12, [cq+64*25] + 
pmulld m21, m12, [cq+64*27] + pmulld m22, m12, [cq+64*29] + pmulld m23, m12, [cq+64*31] + call .main_rect2 +.pass1_end: + vpbroadcastd m11, [o(pd_1)] + lea r4, [cq+64] + call .idct32_pass1_end + lea r5, [o_base_8bpc] + punpckhqdq m19, m5, m16 ; 11 + punpcklqdq m5, m16 ; 10 + punpckhqdq m16, m2, m1 ; 5 + punpcklqdq m2, m1 ; 4 + punpcklqdq m1, m15, m4 ; 2 + punpckhqdq m15, m4 ; 3 + punpcklqdq m4, m14, m18 ; 8 + punpckhqdq m18, m14, m18 ; 9 + punpckhqdq m14, m0, m20 ; 1 + punpcklqdq m0, m20 ; 0 + punpckhqdq m20, m6, m17 ; 13 + punpcklqdq m6, m17 ; 12 + punpckhqdq m17, m3, m21 ; 7 + punpcklqdq m3, m21 ; 6 + punpckhqdq m21, m7, m8 ; 15 + punpcklqdq m7, m8 ; 14 + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf + jmp .end +.fast: + pmulld ym0, ym12, [cq+64*0] + pmulld ym1, ym12, [cq+64*4] + movshdup m7, [o(permB)] + mova ym4, [cq+64*2] + mova ym5, [cq+64*6] + mova ym16, [cq+64*1] + mova ym2, [cq+64*5] + mova ym3, [cq+64*3] + mova ym17, [cq+64*7] + vpermt2q m4, m7, m5 ; 2 6 + vpermt2q m16, m7, m2 ; 1 5 + vpermt2q m17, m7, m3 ; 7 3 + paddd ym0, ym13 + paddd ym1, ym13 + psrad ym0, 12 + psrad ym1, 12 + vpermq m0, m7, m0 ; 0 0 + vpermq m1, m7, m1 ; 4 4 + REPX {pmulld x, m12}, m4, m16, m17 + REPX {paddd x, m13}, m4, m16, m17 + REPX {psrad x, 12 }, m4, m16, m17 + call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2 + vpbroadcastd m11, [o(pd_1)] + call m(idct_16x16_internal_10bpc).main_end2 + call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32 + lea r5, [o_base_8bpc] + punpckhqdq m14, m0, m2 ; 1 + punpcklqdq m0, m2 ; 0 + punpcklqdq m1, m3, m4 ; 2 + punpckhqdq m15, m3, m4 ; 3 + punpcklqdq m2, m5, m7 ; 4 + punpckhqdq m16, m5, m7 ; 5 + punpcklqdq m3, m6, m8 ; 6 + punpckhqdq m17, m6, m8 ; 7 + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast +.end: +%if WIN64 + movaps xmm6, [rsp+ 8] + movaps xmm7, [rsp+24] +%endif + pxor m12, m12 +.zero_loop: + mova [cq+r6*8+64*3], m12 + mova [cq+r6*8+64*2], m12 + mova [cq+r6*8+64*1], m12 + mova [cq+r6*8+64*0], m12 + sub r6d, 8*4 + jge .zero_loop + call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8_start + pmulhrsw m0, m11, m14 + pmulhrsw m1, m11, m15 + pmulhrsw m2, m11, m16 + pmulhrsw m3, m11, m17 + call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 + pmulhrsw m0, m11, m18 + pmulhrsw m1, m11, m19 + pmulhrsw m2, m11, m20 + pmulhrsw m3, m11, m21 + vzeroupper + jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd + or r3d, 16 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 384 + sar r6d, 9 +.dconly2: + vpbroadcastd m3, [o(dconly_10bpc)] + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 + vpbroadcastw m2, r6d + paddsw m2, m3 +.dconly_loop: + paddsw m0, m2, [dstq+strideq*0] + paddsw m1, m2, [dstq+strideq*1] + psubusw m0, m3 + psubusw m1, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + RET +ALIGN function_align +.idct16_sumsub: + psubd m23, m0, m22 ; t15 + paddd m0, m22 ; t0 + psubd m22, m1, m21 ; t14 + paddd m1, m21 ; t1 + REPX {pmaxsd x, m14}, m23, m0, m22, m1 + psubd m21, m2, m20 ; t13 + paddd m2, m20 ; t2 + REPX {pminsd x, m15}, m23, m0, m22, m1 + psubd m20, m3, m19 ; t12 + paddd m3, m19 ; t3 + REPX {pmaxsd x, m14}, m21, m2, m20, m3 + psubd m19, m4, m18 ; t11 + paddd m4, m18 ; t4 + REPX {pminsd x, m15}, m21, m2, m20, m3 + psubd m18, m5, m17 ; t10 + paddd m5, m17 ; t5 + REPX {pmaxsd x, m14}, m19, m4, m18, m5 + psubd m17, m6, m16 ; t9 + paddd m6, m16 ; t6 + REPX {pminsd x, m15}, m19, m4, m18, m5 + 
psubd m16, m7, m9 ; t8 + paddd m7, m9 ; t7 + REPX {pmaxsd x, m14}, m17, m6, m16, m7 + REPX {pminsd x, m15}, m17, m6, m16, m7 + ret +.idct32_pass1_end: + psrlq m12, [o(permC)], 24 ; 0 2 8 10 1 3 9 11 + psrlq m13, m12, 32 ; 4 6 12 14 5 7 13 15 +%macro IDCT32_PASS1_END 2 ; low, high + paddd m8, m11, [r4+128*%1] + paddd m9, m11, [cq+128*%1] + psubd m10, m8, m%1 ; out 16+n + paddd m8, m%1 ; out 15-n + paddd m%1, m9, m%2 ; out 0+n + psubd m9, m%2 ; out 31-n + REPX {vpsravd x, m11}, m10, m%1, m8, m9 + packssdw m%1, m10 ; 0+n 16+n + packssdw m%2, m8, m9 ; 15-n 31-n +%endmacro + IDCT32_PASS1_END 0, 23 ; 0 16, 15 31 + IDCT32_PASS1_END 7, 16 ; 7 23, 8 24 + mova m14, m13 + vpermi2q m14, m0, m16 + vpermt2q m0, m12, m16 + IDCT32_PASS1_END 1, 22 ; 1 17, 14 30 + IDCT32_PASS1_END 6, 17 ; 6 22, 9 25 + mova m15, m13 + vpermi2q m15, m1, m17 + vpermt2q m1, m12, m17 + IDCT32_PASS1_END 2, 21 ; 2 18, 13 29 + IDCT32_PASS1_END 5, 18 ; 5 21, 10 26 + mova m16, m13 + vpermi2q m16, m2, m18 + vpermt2q m2, m12, m18 + IDCT32_PASS1_END 3, 20 ; 3 19, 12 28 + IDCT32_PASS1_END 4, 19 ; 4 20, 11 27 + mova m17, m13 + vpermi2q m17, m3, m19 + vpermt2q m3, m12, m19 + mova m18, m13 + vpermi2q m18, m4, m20 + vpermt2q m4, m12, m20 + mova m19, m13 + vpermi2q m19, m5, m21 + vpermt2q m5, m12, m21 + mova m20, m13 + vpermi2q m20, m6, m22 + vpermt2q m6, m12, m22 + mova m21, m13 + vpermi2q m21, m7, m23 + vpermt2q m7, m12, m23 + punpckhwd m8, m2, m3 ; c04 d04 c05 d05 c06 d06 c07 d07 + punpcklwd m2, m3 ; c00 d00 c01 d01 c02 d02 c03 d03 + punpckhwd m3, m0, m1 ; a04 b04 a05 b05 a06 b06 a07 b07 + punpcklwd m0, m1 ; a00 b00 a01 b01 a02 b02 a03 b03 + punpckhwd m1, m4, m5 ; e04 f04 e05 f05 e06 f06 e07 f07 + punpcklwd m4, m5 ; e00 f00 e01 f01 e02 f02 e03 f03 + punpckhwd m5, m6, m7 ; g04 h04 g05 h05 g06 h06 g07 h07 + punpcklwd m6, m7 ; g00 h00 g01 h01 g02 h02 g03 h03 + punpckhwd m7, m14, m15 ; a12 b12 a13 b13 a14 b14 a15 b15 + punpcklwd m14, m15 ; a08 b08 a09 b09 a10 b10 a11 b11 + punpckhwd m15, m16, m17 ; c12 d12 c13 d13 c14 d14 c15 d15 + punpcklwd m16, m17 ; c08 d08 c09 d09 c10 d10 c11 d11 + punpckhwd m17, m18, m19 ; e12 f12 e13 f13 e14 f14 e15 f15 + punpcklwd m18, m19 ; e08 f08 e09 f09 e10 f10 e11 f11 + punpckhwd m19, m20, m21 ; g12 h12 g13 h13 g14 h14 g15 h15 + punpcklwd m20, m21 ; g08 h08 g09 h09 g10 h10 g11 h11 + punpckhdq m21, m1, m5 ; e06 f06 g06 h06 e07 f07 g07 h07 + punpckldq m1, m5 ; e04 f04 g04 h04 e05 f05 g05 h05 + punpckhdq m5, m14, m16 ; a10 b10 c10 d10 a11 b11 c11 d11 + punpckldq m14, m16 ; a08 b08 c08 d08 a09 b09 c09 d09 + punpckhdq m16, m18, m20 ; e10 f10 g10 h10 e11 f11 g11 h11 + punpckldq m18, m20 ; e08 f08 g08 h08 e09 f09 g09 h09 + punpckldq m20, m4, m6 ; e00 f00 g00 h00 e01 f01 g01 h01 + punpckhdq m4, m6 ; e02 f02 g02 h02 e03 f03 g03 h03 + punpckldq m6, m7, m15 ; a12 b12 c12 d12 a13 b13 c13 d13 + punpckhdq m7, m15 ; a14 b14 c14 d14 a15 b15 c15 d15 + punpckhdq m15, m0, m2 ; a02 b02 c02 d02 a03 b03 c03 d03 + punpckldq m0, m2 ; a00 b00 c00 d00 a01 b01 c01 d01 + punpckldq m2, m3, m8 ; a04 b04 c04 d04 a05 b05 c05 d05 + punpckhdq m3, m8 ; a06 b06 c06 d06 a07 b07 c07 d07 + punpckhdq m8, m17, m19 ; e14 f14 g14 h14 e15 f15 g15 h15 + punpckldq m17, m19 ; e12 f12 g12 h12 e13 f13 g13 h13 + ret +.pass1_load_spill: + mova [cq+64* 0], m0 + mova [cq+64* 2], m1 + mova [cq+64* 4], m2 + mova [cq+64* 6], m3 + mova [cq+64* 8], m4 + mova [cq+64*10], m5 + mova [cq+64*12], m6 + mova [cq+64*14], m7 + pmulld m0, m12, [cq+64* 1] + pmulld m1, m12, [cq+64* 3] + pmulld m2, m12, [cq+64* 5] + pmulld m3, m12, [cq+64* 7] + pmulld m4, m12, [cq+64* 9] + pmulld m5, 
m12, [cq+64*11] + pmulld m6, m12, [cq+64*13] + pmulld m7, m12, [cq+64*15] + mova [cq+64* 1], m23 + mova [cq+64* 3], m22 + mova [cq+64* 5], m21 + mova [cq+64* 7], m20 + mova [cq+64* 9], m19 + mova [cq+64*11], m18 + mova [cq+64*13], m17 + mova [cq+64*15], m16 + ret +.main_fast_rect2: + call m(idct_8x16_internal_10bpc).round +.main_fast: ; bottom half is zero + pmulld m23, m0, [o(pd_4091)] {1to16} ; t31a + pmulld m0, [o(pd_201)] {1to16} ; t16a + pmulld m16, m7, [o(pd_2751)] {1to16} ; t17a + pmulld m7, [o(pd_3035)] {1to16} ; t30a + pmulld m19, m4, [o(pd_3703)] {1to16} ; t29a + pmulld m4, [o(pd_1751)] {1to16} ; t18a + pmulld m20, m3, [o(pd_1380)] {1to16} ; t19a + pmulld m3, [o(pd_3857)] {1to16} ; t28a + pmulld m21, m2, [o(pd_3973)] {1to16} ; t27a + pmulld m2, [o(pd_995)] {1to16} ; t20a + pmulld m18, m5, [o(pd_2106)] {1to16} ; t21a + pmulld m5, [o(pd_3513)] {1to16} ; t26a + pmulld m17, m6, [o(pd_3290)] {1to16} ; t25a + pmulld m6, [o(pd_2440)] {1to16} ; t22a + pmulld m22, m1, [o(pd_601)] {1to16} ; t23a + pmulld m1, [o(pd_4052)] {1to16} ; t24a + REPX {psubd x, m13, x}, m16, m20, m18, m22 + call m(idct_16x16_internal_10bpc).round3 + jmp .main2 +.main_rect2: + call m(idct_8x16_internal_10bpc).round + call m(idct_16x16_internal_10bpc).round +.main: + ITX_MULSUB_2D 0, 23, 8, 9, 10, _, 201, 4091 ; t16a, t31a + ITX_MULSUB_2D 16, 7, 8, 9, 10, _, 3035, 2751 ; t17a, t30a + ITX_MULSUB_2D 4, 19, 8, 9, 10, _, 1751, 3703 ; t18a, t29a + ITX_MULSUB_2D 20, 3, 8, 9, 10, _, 3857, 1380 ; t19a, t28a + ITX_MULSUB_2D 2, 21, 8, 9, 10, _, 995, 3973 ; t20a, t27a + ITX_MULSUB_2D 18, 5, 8, 9, 10, _, 3513, 2106 ; t21a, t26a + ITX_MULSUB_2D 6, 17, 8, 9, 10, _, 2440, 3290 ; t22a, t25a + ITX_MULSUB_2D 22, 1, 8, 9, 10, _, 4052, 601 ; t23a, t24a + call m(idct_16x16_internal_10bpc).round +.main2: + call m(idct_8x16_internal_10bpc).round + psubd m8, m0, m16 ; t17 + paddd m0, m16 ; t16 + psubd m16, m23, m7 ; t30 + paddd m23, m7 ; t31 + REPX {pmaxsd x, m14}, m8, m0, m16, m23 + paddd m7, m20, m4 ; t19 + psubd m20, m4 ; t18 + REPX {pminsd x, m15}, m8, m0, m16, m23 + paddd m4, m3, m19 ; t28 + psubd m3, m19 ; t29 + REPX {pmaxsd x, m14}, m7, m20, m4, m3 + psubd m19, m2, m18 ; t21 + paddd m2, m18 ; t20 + REPX {pminsd x, m15}, m7, m20, m4, m3 + psubd m18, m21, m5 ; t26 + paddd m21, m5 ; t27 + REPX {pmaxsd x, m14}, m19, m2, m18, m21 + psubd m5, m22, m6 ; t22 + paddd m6, m22 ; t23 + REPX {pminsd x, m15}, m19, m2, m18, m21 + psubd m22, m1, m17 ; t25 + paddd m17, m1 ; t24 + REPX {pmaxsd x, m14}, m5, m6, m22, m17 + vpbroadcastd m11, [o(pd_4017)] + vpbroadcastd m10, [o(pd_799)] + REPX {pminsd x, m15}, m5, m6, m22, m17 + ITX_MULSUB_2D 16, 8, 9, 1, _, 13, 10, 11 ; t17a, t30a + ITX_MULSUB_2D 3, 20, 9, 1, _, 13, 10, 11, 2 ; t29a, t18a + vpbroadcastd m11, [o(pd_2276)] + vpbroadcastd m10, [o(pd_3406)] + ITX_MULSUB_2D 18, 19, 9, 1, _, 13, 10, 11 ; t21a, t26a + ITX_MULSUB_2D 22, 5, 9, 1, _, 13, 10, 11, 2 ; t25a, t22a + paddd m1, m6, m2 ; t23a + psubd m6, m2 ; t20a + psubd m2, m17, m21 ; t27a + paddd m17, m21 ; t24a + REPX {pmaxsd x, m14}, m1, m6, m2, m17 + psubd m21, m23, m4 ; t28a + paddd m23, m4 ; t31a + REPX {pminsd x, m15}, m1, m6, m2, m17 + psubd m4, m16, m20 ; t18 + paddd m16, m20 ; t17 + REPX {pmaxsd x, m14}, m21, m23, m4, m16 + psubd m20, m0, m7 ; t19a + paddd m0, m7 ; t16a + REPX {pminsd x, m15}, m21, m23, m4, m16 + psubd m7, m8, m3 ; t29 + paddd m3, m8 ; t30 + REPX {pmaxsd x, m14}, m20, m0, m7, m3 + paddd m8, m5, m18 ; t22 + psubd m5, m18 ; t21 + REPX {pminsd x, m15}, m20, m0, m7, m3 + psubd m18, m22, m19 ; t26 + paddd m22, m19 ; t25 + REPX 
{pmaxsd x, m14}, m8, m5, m18, m22 + vpbroadcastd m11, [o(pd_3784)] + vpbroadcastd m10, [o(pd_1567)] + REPX {pminsd x, m15}, m8, m5, m18, m22 + ITX_MULSUB_2D 21, 20, 9, 19, _, 13, 10, 11 ; t19, t28 + ITX_MULSUB_2D 2, 6, 9, 19, _, 13, 10, 11, 2 ; t27, t20 + ITX_MULSUB_2D 7, 4, 9, 19, _, 13, 10, 11 ; t18a, t29a + ITX_MULSUB_2D 18, 5, 9, 19, _, 13, 10, 11, 2 ; t26a, t21a + psubd m19, m0, m1 ; t23 + paddd m0, m1 ; t16 + paddd m1, m8, m16 ; t17a + psubd m8, m16, m8 ; t22a + REPX {pmaxsd x, m14}, m19, m0, m1, m8 + psubd m16, m23, m17 ; t24 + paddd m23, m17 ; t31 + REPX {pminsd x, m15}, m19, m0, m1, m8 + psubd m17, m3, m22 ; t25a + paddd m22, m3 ; t30a + REPX {pmaxsd x, m14}, m16, m23, m17, m22 + paddd m3, m6, m21 ; t19a + psubd m6, m21, m6 ; t20a + REPX {pminsd x, m15}, m16, m23, m17, m22 + paddd m21, m18, m4 ; t29 + psubd m18, m4, m18 ; t26 + REPX {pmaxsd x, m14}, m3, m6, m21, m18 + psubd m4, m20, m2 ; t27a + paddd m20, m2 ; t28a + REPX {pminsd x, m15}, m3, m6, m21, m18 + paddd m2, m7, m5 ; t18 + psubd m7, m5 ; t21 + REPX {pmaxsd x, m14}, m4, m20, m2, m7 + REPX {pminsd x, m15}, m4, m20, m2, m7 + REPX {pmulld x, m12}, m18, m16, m4, m17, m7, m19, m6, m8 + REPX {paddd x, m13}, m18, m16, m4, m17 + psubd m5, m18, m7 ; t21a + paddd m18, m7 ; t26a + psubd m7, m16, m19 ; t23a + paddd m16, m19 ; t24a + REPX {psrad x, 12 }, m5, m18, m7, m16 + paddd m19, m4, m6 ; t27 + psubd m4, m6 ; t20 + psubd m6, m17, m8 ; t22 + paddd m17, m8 ; t25 + REPX {psrad x, 12 }, m19, m4, m6, m17 + ret + +cglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 16, dst, stride, c, eob +%undef cmp + vpbroadcastd m10, [pw_2896x8] + vpbroadcastd m11, [pw_1697x16] + vpbroadcastd m13, [pw_2048] + vpbroadcastd m15, [pixel_10bpc_max] + lea r6, [strideq*9] + pxor m14, m14 + cmp eobd, 151 + jl .main + mov r4, dstq + call .main + add cq, 64*12 + lea dstq, [r4+32] +.main: + call .main_internal + add cq, 64*4 + pmulhrsw m1, m13, m2 + pmulhrsw m3, m13, m4 + pmulhrsw m5, m13, m6 + pmulhrsw m7, m13, m8 + call .main_internal + jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2 +.main_internal: + mova m8, [cq+64* 0] + packssdw m8, [cq+64* 8] + mova m6, [cq+64* 1] + packssdw m6, [cq+64* 9] + mova m0, [cq+64* 2] + packssdw m0, [cq+64*10] + mova m2, [cq+64* 3] + packssdw m2, [cq+64*11] + REPX {pmulhrsw x, m10}, m8, m6, m0, m2 + REPX {paddsw x, x }, m8, m6, m0, m2 + REPX {vpermq x, x, q3120}, m8, m6, m0, m2 + pmulhrsw m4, m11, m8 + pmulhrsw m9, m11, m6 + paddsw m8, m8 + paddsw m6, m6 + REPX {mova [cq+64*x], m14}, 0, 1, 2, 3 + paddsw m8, m4 + paddsw m6, m9 + pmulhrsw m4, m11, m0 + pmulhrsw m9, m11, m2 + paddsw m0, m0 + paddsw m2, m2 + REPX {mova [cq+64*x], m14}, 8, 9, 10, 11 + paddsw m0, m4 + paddsw m2, m9 + punpcklwd m4, m8, m6 + punpckhwd m8, m6 + punpcklwd m6, m0, m2 + punpckhwd m0, m2 + punpckldq m2, m4, m6 ; 0 1 + punpckhdq m4, m6 ; 2 3 + punpckldq m6, m8, m0 ; 4 5 + punpckhdq m8, m0 ; 6 7 + ret + +cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + WIN64_SPILL_XMM 30 + cmp eobd, 136 + jl .fast + add cq, 64 + cmp eobd, 543 + jge .full + call .pass1_fast ; bottomright 16x16 zero + mov r6d, 16*12 + jmp .lefthalf +.full: + call .pass1 + mov r6d, 16*28 +.lefthalf: + mova [cq+128* 0], m0 + mova [cq+128* 1], m1 + mova [cq+128* 2], m2 + mova [cq+128* 3], m3 + mova [cq+128* 4], m14 + mova [cq+128* 5], m15 + mova [cq+128* 6], 
m16 + mova [cq+128* 7], m17 + mova [cq+128* 8], m22 + mova [cq+128* 9], m23 + mova [cq+128*10], m24 + mova [cq+128*11], m25 + mova [cq+128*12], m26 + mova [cq+128*13], m27 + mova [cq+128*14], m28 + mova [cq+128*15], m29 + sub cq, 64 + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + call .pass1 + lea r5, [o_base_8bpc] + mova m4, [cq+64+128* 0] + mova m5, [cq+64+128* 1] + mova m6, [cq+64+128* 2] + mova m7, [cq+64+128* 3] + mova m18, [cq+64+128* 4] + mova m19, [cq+64+128* 5] + mova m20, [cq+64+128* 6] + mova m21, [cq+64+128* 7] + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf + mova [cq+128*0], m14 + mova [cq+128*1], m15 + mova [cq+128*2], m16 + mova [cq+128*3], m17 + mova [cq+128*4], m18 + mova [cq+128*5], m19 + mova [cq+128*6], m20 + mova [cq+128*7], m21 + mova m14, [cq+64+128* 8] + mova m15, [cq+64+128* 9] + mova m16, [cq+64+128*10] + mova m17, [cq+64+128*11] + mova m18, [cq+64+128*12] + mova m19, [cq+64+128*13] + mova m20, [cq+64+128*14] + mova m21, [cq+64+128*15] + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf + pxor m12, m12 +.right_zero_loop: + mova [cq+r6*8+64+128*3], m12 + mova [cq+r6*8+64+128*2], m12 + mova [cq+r6*8+64+128*1], m12 + mova [cq+r6*8+64+128*0], m12 + sub r6d, 16*4 + jge .right_zero_loop + mov r6d, 16*28 + jmp .end2 +.fast: ; topleft 16x16 nonzero + cmp eobd, 36 + jl .fast2 + call .pass1_fast + lea r5, [o_base_8bpc] + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast + mova [cq+128*0], m14 + mova [cq+128*1], m15 + mova [cq+128*2], m16 + mova [cq+128*3], m17 + mova [cq+128*4], m18 + mova [cq+128*5], m19 + mova [cq+128*6], m20 + mova [cq+128*7], m21 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast + jmp .end +.fast2: ; topleft 8x8 nonzero + movshdup m7, [o(permB)] + mova ym0, [cq+128*0] + mova ym1, [cq+128*4] + mova ym4, [cq+128*2] + mova ym5, [cq+128*6] + mova ym16, [cq+128*1] + mova ym2, [cq+128*5] + mova ym3, [cq+128*3] + mova ym17, [cq+128*7] + mov r6d, 16*4 + vpermq m0, m7, m0 ; 0 0 + vpermq m1, m7, m1 ; 4 4 + vpermt2q m4, m7, m5 ; 2 6 + vpermt2q m16, m7, m2 ; 1 5 + vpermt2q m17, m7, m3 ; 7 3 + call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2 + call m(idct_16x16_internal_10bpc).main_end + call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32 + lea r5, [o_base_8bpc] + punpckhqdq m22, m0, m2 ; 1 + punpcklqdq m0, m2 ; 0 + punpcklqdq m1, m5, m7 ; 4 + punpckhqdq m24, m5, m7 ; 5 + punpcklqdq m14, m3, m4 ; 2 + punpckhqdq m23, m3, m4 ; 3 + punpcklqdq m15, m6, m8 ; 6 + punpckhqdq m25, m6, m8 ; 7 + mova m10, m13 + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2 + mova [cq+128*0], m14 + mova [cq+128*1], m15 + mova [cq+128*2], m16 + mova [cq+128*3], m17 + mova [cq+128*4], m18 + mova [cq+128*5], m19 + mova [cq+128*6], m20 + mova [cq+128*7], m21 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2 +.end: + pxor m12, m12 +.end2: + psubsw m9, m0, m29 ; out31 + paddsw m0, m29 ; out0 + psubsw m29, m1, m28 ; out30 + paddsw m1, m28 ; out1 + psubsw m28, m2, m27 ; out29 + paddsw m2, m27 ; out2 + psubsw m27, m3, m26 ; out28 + paddsw m3, m26 ; out3 + psubsw m26, m4, m25 ; out27 + paddsw m4, m25 ; out4 + psubsw m25, m5, m24 ; out26 + paddsw m5, m24 ; out5 + psubsw m24, m6, m23 ; out25 + paddsw m6, m23 ; out6 + psubsw m23, m7, m22 ; out24 + paddsw m7, m22 ; out7 + call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8_start + mova m0, [cq+128*0] + mova m1, [cq+128*1] + mova m2, [cq+128*2] + mova m3, 
[cq+128*3] + mova m4, [cq+128*4] + mova m5, [cq+128*5] + mova m6, [cq+128*6] + mova m7, [cq+128*7] + psubsw m22, m0, m21 ; out23 + paddsw m0, m21 ; out8 + psubsw m21, m1, m20 ; out22 + paddsw m1, m20 ; out9 + psubsw m20, m2, m19 ; out21 + paddsw m2, m19 ; out10 + psubsw m19, m3, m18 ; out20 + paddsw m3, m18 ; out11 + psubsw m18, m4, m17 ; out19 + paddsw m4, m17 ; out12 + psubsw m17, m5, m16 ; out18 + paddsw m5, m16 ; out13 + psubsw m16, m6, m15 ; out17 + paddsw m6, m15 ; out14 + psubsw m15, m7, m14 ; out16 + paddsw m7, m14 ; out15 +.zero_loop: + mova [cq+r6*8+128*3], m12 + mova [cq+r6*8+128*2], m12 + mova [cq+r6*8+128*1], m12 + mova [cq+r6*8+128*0], m12 + sub r6d, 16*4 + jge .zero_loop + call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8 + pmulhrsw m0, m11, m15 + pmulhrsw m1, m11, m16 + pmulhrsw m2, m11, m17 + pmulhrsw m3, m11, m18 + call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 + pmulhrsw m0, m11, m19 + pmulhrsw m1, m11, m20 + pmulhrsw m2, m11, m21 + pmulhrsw m3, m11, m22 + call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 + pmulhrsw m0, m11, m23 + pmulhrsw m1, m11, m24 + pmulhrsw m2, m11, m25 + pmulhrsw m3, m11, m26 + call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 + pmulhrsw m0, m11, m27 + pmulhrsw m1, m11, m28 + pmulhrsw m2, m11, m29 + pmulhrsw m3, m11, m9 + WIN64_RESTORE_XMM + vzeroupper + jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd + or r3d, 32 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly2 +.pass1_fast: + mova m0, [cq+128* 0] + mova m1, [cq+128* 4] + mova m2, [cq+128* 8] + mova m3, [cq+128*12] + mov r6d, 16*12 + call m(idct_8x16_internal_10bpc).main_fast + mova m16, [cq+128* 2] + mova m17, [cq+128* 6] + mova m18, [cq+128*10] + mova m19, [cq+128*14] + call m(idct_16x16_internal_10bpc).main_fast + call .pass1_load_spill + call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast + jmp .pass1_end +.pass1: + mova m0, [cq+128* 0] + mova m1, [cq+128* 4] + mova m2, [cq+128* 8] + mova m3, [cq+128*12] + mova m4, [cq+128*16] + mova m5, [cq+128*20] + mova m6, [cq+128*24] + mova m7, [cq+128*28] + call m(idct_8x16_internal_10bpc).main + mova m16, [cq+128* 2] + mova m17, [cq+128* 6] + mova m18, [cq+128*10] + mova m19, [cq+128*14] + mova m20, [cq+128*18] + mova m21, [cq+128*22] + mova m22, [cq+128*26] + mova m23, [cq+128*30] + call m(idct_16x16_internal_10bpc).main + call .pass1_load_spill + mova m16, [cq+128*17] + mova m17, [cq+128*19] + mova m18, [cq+128*21] + mova m19, [cq+128*23] + mova m20, [cq+128*25] + mova m21, [cq+128*27] + mova m22, [cq+128*29] + mova m23, [cq+128*31] + call m(inv_txfm_add_dct_dct_32x16_10bpc).main +.pass1_end: + vpbroadcastd m11, [o(pd_2)] + lea r4, [cq+128*8] + call m(inv_txfm_add_dct_dct_32x16_10bpc).idct32_pass1_end + punpckhqdq m22, m0, m20 ; 1 + punpcklqdq m0, m20 ; 0 + punpckhqdq m24, m2, m1 ; 5 + punpcklqdq m1, m2, m1 ; 4 + punpcklqdq m2, m14, m18 ; 8 + punpckhqdq m26, m14, m18 ; 9 + punpcklqdq m14, m15, m4 ; 2 + punpckhqdq m23, m15, m4 ; 3 + punpckhqdq m25, m3, m21 ; 7 + punpcklqdq m15, m3, m21 ; 6 + punpckhqdq m28, m6, m17 ; 13 + punpcklqdq m3, m6, m17 ; 12 + punpckhqdq m27, m5, m16 ; 11 + punpcklqdq m16, m5, m16 ; 10 + punpckhqdq m29, m7, m8 ; 15 + punpcklqdq m17, m7, m8 ; 14 + ret +.pass1_load_spill: + call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub + mova [cq+128* 0], m0 + mova m0, [cq+128* 1] + mova [cq+128* 1], m1 + mova [cq+128* 2], m2 + mova m1, [cq+128* 3] + mova m2, [cq+128* 5] + mova [cq+128* 3], m3 + mova [cq+128* 4], m4 + mova m3, [cq+128* 7] + mova m4, 
[cq+128* 9] + mova [cq+128* 5], m5 + mova [cq+128* 6], m6 + mova [cq+128* 7], m7 + mova m5, [cq+128*11] + mova m6, [cq+128*13] + mova m7, [cq+128*15] + mova [cq+128* 8], m23 + mova [cq+128* 9], m22 + mova [cq+128*10], m21 + mova [cq+128*11], m20 + mova [cq+128*12], m19 + mova [cq+128*13], m18 + mova [cq+128*14], m17 + mova [cq+128*15], m16 + ret + +cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 7, 16, dst, stride, c, eob +%undef cmp + vpbroadcastd m13, [pw_8192] + vpbroadcastd m15, [pixel_10bpc_max] + pxor m14, m14 + lea r6, [strideq*9] + cmp eobd, 136 + jl .main + mov r4, dstq + call .main + add cq, 64-128*4 + lea dstq, [dstq+strideq*8] + call .main + add cq, 128*12-64 + lea dstq, [r4+32] + cmp eobd, 543 + jl .main + call .main + add cq, 64-128*4 + lea dstq, [dstq+strideq*8] +.main: + call .main_internal + add cq, 128*4 + pmulhrsw m1, m13, m2 + pmulhrsw m3, m13, m4 + pmulhrsw m5, m13, m6 + pmulhrsw m7, m13, m8 + call .main_internal + jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2 +.main_internal: + mova m8, [cq+128* 0] + packssdw m8, [cq+128* 8] + mova m6, [cq+128* 1] + packssdw m6, [cq+128* 9] + mova m0, [cq+128* 2] + packssdw m0, [cq+128*10] + mova m2, [cq+128* 3] + packssdw m2, [cq+128*11] + REPX {vpermq x, x, q3120}, m8, m6, m0, m2 + REPX {mova [cq+128*x], m14}, 0, 1, 2, 3 + punpcklwd m4, m8, m6 + punpckhwd m8, m6 + punpcklwd m6, m0, m2 + punpckhwd m0, m2 + REPX {mova [cq+128*x], m14}, 8, 9, 10, 11 + punpckldq m2, m4, m6 ; 0 1 + punpckhdq m4, m6 ; 2 3 + punpckldq m6, m8, m0 ; 4 5 + punpckhdq m8, m0 ; 6 7 + ret + +cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob + lea r5, [o_base] + test eobd, eobd + jz .dconly + + PROLOGUE 4, 7, 32, -8*mmsize, dst, stride, c, eob +%undef cmp + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + cmp eobd, 36 + jl .fast + call .pass1 + cmp eobd, 151 + jge .full + lea r5, [o_base_8bpc] + + punpckhwd m22, m0, m0 + punpckhwd m23, m1, m1 + punpckhwd m24, m2, m2 + punpckhwd m25, m3, m3 + punpckhwd m26, m4, m4 + punpckhwd m27, m5, m5 + punpckhwd m28, m6, m6 + punpckhwd m29, m7, m7 + punpcklwd m21, m1, m1 + punpcklwd m14, m3, m3 + punpcklwd m18, m5, m5 + punpcklwd m15, m7, m7 + pxor m9, m9 + punpcklwd m9, m9, m0 + punpcklwd m8, m2, m2 + punpcklwd m7, m4, m4 + punpcklwd m1, m6, m6 + call m(idct_16x16_internal_8bpc).main_fast2 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 + mova [rsp+mmsize*0], m14 + mova [rsp+mmsize*1], m15 + mova [rsp+mmsize*2], m16 + mova [rsp+mmsize*3], m17 + mova [rsp+mmsize*4], m18 + mova [rsp+mmsize*5], m19 + mova [rsp+mmsize*6], m20 + mova [rsp+mmsize*7], m21 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast + + pxor m12, m12 + mov r3d, 64*3 +.zero_loop: + REPX {mova [cq+r3*8+128*x], m12}, 0, 1, 2, 3 + sub r3d, 64 + jge .zero_loop + + jmp .pass2_end +.full: + mova [cq+128*0], m0 + mova [cq+128*1], m1 + mova [cq+128*2], m2 + mova [cq+128*3], m3 + mova [cq+128*4], m4 + mova [cq+128*5], m5 + mova [cq+128*6], m6 + mova [cq+128*7], m7 + add cq, 64 + call .pass1 + sub cq, 64 + mova m22, [cq+128*0] ; 0 1 + mova m23, [cq+128*1] ; 2 3 + mova m24, [cq+128*2] ; 4 5 + mova m25, [cq+128*3] ; 6 7 + mova m26, [cq+128*4] ; 8 9 + mova m27, [cq+128*5] ; 10 11 + mova m28, [cq+128*6] ; 12 13 + mova m29, [cq+128*7] ; 14 15 + mova [cq+64* 8], m0 + mova [cq+64* 9], m1 + mova [cq+64*10], m2 + mova [cq+64*11], m3 + mova [cq+64*12], m4 + mova [cq+64*13], m5 + mova [cq+64*14], m6 + mova [cq+64*15], m7 + lea r5, 
[o_base_8bpc] + + punpcklwd m20, m1, m1 + punpcklwd m16, m3, m3 + punpcklwd m19, m5, m5 + punpcklwd m17, m7, m7 + punpcklwd m8, m24, m24 ; 4 + punpcklwd m5, m2, m2 ; 20 + punpcklwd m1, m28, m28 ; 12 + punpcklwd m7, m26, m26 ; 8 + punpcklwd m3, m4, m4 ; 24 + punpcklwd m4, m6, m6 ; 28 + pxor m9, m9 + punpcklwd m6, m9, m0 ; __ 16 + mova m0, m4 + punpcklwd m9, m9, m22 ; __ 0 + call m(idct_16x16_internal_8bpc).main_fast + punpcklwd m21, m23, m23 ; 2 + punpcklwd m15, m29, m29 ; 14 + punpcklwd m18, m27, m27 ; 10 + punpcklwd m14, m25, m25 ; 6 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + mova [rsp+mmsize*0], m14 + mova [rsp+mmsize*1], m15 + mova [rsp+mmsize*2], m16 + mova [rsp+mmsize*3], m17 + mova [rsp+mmsize*4], m18 + mova [rsp+mmsize*5], m19 + mova [rsp+mmsize*6], m20 + mova [rsp+mmsize*7], m21 + mova m21, [cq+64*15] + mova m14, [cq+64* 8] + mova m17, [cq+64*11] + mova m18, [cq+64*12] + mova m19, [cq+64*13] + mova m16, [cq+64*10] + mova m15, [cq+64* 9] + mova m20, [cq+64*14] + REPX {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \ + m24, m19, m16, m27, m28, m15, m20, m23 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf + + pxor m12, m12 + mov r3d, 32*7 +.full_zero_loop: + REPX {mova [cq+r3*8+64*x], m12}, 0, 1, 2, 3 + sub r3d, 32 + jge .full_zero_loop + + jmp .pass2_end +.fast: + mova ym0, [cq+128*0] + mova ym2, [cq+128*4] + movshdup m8, [o(permB)] + mova ym1, [cq+128*2] + mova ym3, [cq+128*6] + mova ym4, [cq+128*1] + mova ym5, [cq+128*3] + mova ym6, [cq+128*5] + mova ym7, [cq+128*7] + vpermt2q m0, m8, m2 ; 0 4 + vpermt2q m1, m8, m3 ; 2 6 + vpermt2q m4, m8, m5 ; 1 3 + vpermt2q m7, m8, m6 ; 7 5 + call m(idct_8x8_internal_10bpc).main_fast + call m(idct_16x8_internal_10bpc).main_fast + vpbroadcastd m11, [o(pd_2)] + call m(idct_8x16_internal_10bpc).main_end2 + mova m8, [o(idct8x32p)] + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + mova m6, [dup16_perm] + vpermb m0, m8, m0 + vpermb m2, m8, m2 + vprold m8, 16 + vpermb m1, m8, m1 + vpermb m3, m8, m3 + punpckldq m4, m0, m2 + punpckhdq m0, m2 + punpckldq m2, m1, m3 + punpckhdq m1, m3 + punpckldq m21, m4, m2 + punpckhdq m14, m4, m2 + punpckldq m18, m0, m1 + punpckhdq m15, m0, m1 + vpord m7, m6, [o(pb_32)] {1to16} + vpermb m22, m7, m21 ; 1 + pmovzxwd m9, ym21 ; 0 + vpermb m8, m6, m18 ; 4 + vpermb m24, m7, m18 ; 5 + vpermb m21, m6, m14 ; 2 + vpermb m23, m7, m14 ; 3 + vpermb m14, m6, m15 ; 6 + vpermb m25, m7, m15 ; 7 + lea r5, [o_base_8bpc] + pslld m9, 16 + + pxor m7, m7 + REPX {mova x, m7}, m1, m18, m15, m26, m27, m28, m29 + + call m(idct_16x16_internal_8bpc).main_fast2 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 + mova [rsp+mmsize*0], m14 + mova [rsp+mmsize*1], m15 + mova [rsp+mmsize*2], m16 + mova [rsp+mmsize*3], m17 + mova [rsp+mmsize*4], m18 + mova [rsp+mmsize*5], m19 + mova [rsp+mmsize*6], m20 + mova [rsp+mmsize*7], m21 + + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast + + pxor m12, m12 + REPX {mova [cq+128*x], ym12}, 0, 1, 2, 3, 4, 5, 6, 7 +.pass2_end: + movshdup m30, [permC] + vpbroadcastd m11, [pw_2048] + vpbroadcastd m13, [pixel_10bpc_max] + lea r6, [strideq*3] + psrlq m31, m30, 8 + vpermq m8, m30, m0 + vpermq m9, m31, m1 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m2 + vpermq m9, m31, m3 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m4 + vpermq m9, m31, m5 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m6 + vpermq m9, m31, m7 + call m(idct_16x8_internal_10bpc).write_16x4 + + mova m1, [rsp+mmsize*0] + mova 
m2, [rsp+mmsize*1] + mova m3, [rsp+mmsize*2] + mova m4, [rsp+mmsize*3] + mova m5, [rsp+mmsize*4] + mova m6, [rsp+mmsize*5] + mova m7, [rsp+mmsize*6] + mova m8, [rsp+mmsize*7] + + paddsw m0, m1, m21 + psubsw m21, m1, m21 + paddsw m1, m2, m20 + psubsw m20, m2, m20 + paddsw m2, m3, m19 + psubsw m19, m3, m19 + paddsw m3, m4, m18 + psubsw m18, m4, m18 + paddsw m4, m5, m17 + psubsw m17, m5, m17 + paddsw m5, m6, m16 + psubsw m16, m6, m16 + paddsw m6, m7, m15 + psubsw m15, m7, m15 + paddsw m7, m8, m14 + psubsw m14, m8, m14 + + vpermq m8, m30, m0 + vpermq m9, m31, m1 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m2 + vpermq m9, m31, m3 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m4 + vpermq m9, m31, m5 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m6 + vpermq m9, m31, m7 + call m(idct_16x8_internal_10bpc).write_16x4 + + vpermq m8, m30, m14 + vpermq m9, m31, m15 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m16 + vpermq m9, m31, m17 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m18 + vpermq m9, m31, m19 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m20 + vpermq m9, m31, m21 + call m(idct_16x8_internal_10bpc).write_16x4 + + vpermq m8, m30, m22 + vpermq m9, m31, m23 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m24 + vpermq m9, m31, m25 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m26 + vpermq m9, m31, m27 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m28 + vpermq m9, m31, m29 + call m(idct_16x8_internal_10bpc).write_16x4 + RET +.pass1: + mova m0, [cq+128* 0] + mova m1, [cq+128* 2] + mova m2, [cq+128* 4] + mova m3, [cq+128* 6] + mova m4, [cq+128* 8] + mova m5, [cq+128*10] + mova m6, [cq+128*12] + mova m7, [cq+128*14] + call m(idct_8x16_internal_10bpc).main + mova m16, [cq+128* 1] + mova m17, [cq+128* 3] + mova m18, [cq+128* 5] + mova m19, [cq+128* 7] + mova m20, [cq+128* 9] + mova m21, [cq+128*11] + mova m22, [cq+128*13] + mova m23, [cq+128*15] + call m(idct_16x16_internal_10bpc).main + call m(idct_16x16_internal_10bpc).main_end + jmp m(idct_16x16_internal_10bpc).main_end3 +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd + or r3d, 64 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly2 + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/itx16_sse.asm b/third_party/dav1d/src/x86/itx16_sse.asm new file mode 100644 index 0000000000..3833e17c99 --- /dev/null +++ b/third_party/dav1d/src/x86/itx16_sse.asm @@ -0,0 +1,8135 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; Copyright © 2017-2021, The rav1e contributors +; Copyright © 2020, Nathan Egge +; Copyright © 2021, Matthias Dressel +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA +%macro COEF 1-2 +pd_%1: times 4 dd %1 +%if %0 == 2 +pd_m%1: times 4 dd -%1 +%endif +%endmacro + +COEF 201 +COEF 401 +COEF 601, 1 +COEF 799 +COEF 995 +COEF 1189, 1 +COEF 1380, 1 +COEF 1567 +COEF 1751 +COEF 1931 +COEF 2106, 1 +COEF 2276, 1 +COEF 2440 +COEF 2598, 1 +COEF 2751, 1 +COEF 2896 +COEF 3035 +COEF 3166 +COEF 3290 +COEF 3406 +COEF 3513 +COEF 3612 +COEF 3703 +COEF 3784 +COEF 3857 +COEF 3920 +COEF 3973 +COEF 4017 +COEF 4052 +COEF 4076 +COEF 4091 + +deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 + +%if ARCH_X86_32 +pd_1: times 4 dd 1 +%endif +pd_2: times 4 dd 2 +pw_5: times 8 dw 5 +pd_1321: times 4 dd 1321 +pd_2482: times 4 dd 2482 +pd_m3344: times 4 dd -3344 +pd_2048: times 4 dd 2048 +pw_4x2048_4xm2048: times 4 dw 2048 + times 4 dw -2048 +pw_4xm2048_4x2048: times 4 dw -2048 + times 4 dw 2048 +pw_2048: times 8 dw 2048 +pw_m2048: times 8 dw -2048 +pd_3803: times 4 dd 3803 +pw_4096: times 8 dw 4096 +pd_5793: times 4 dd 5793 +pd_6144: times 4 dd 6144 +pw_8192: times 8 dw 8192 +pd_10240: times 4 dd 10240 +pd_11586: times 4 dd 11586 +pw_1697x8: times 8 dw 1697*8 +pw_2896x8: times 8 dw 2896*8 +pw_1697x16: times 8 dw 1697*16 +pw_16384: times 8 dw 16384 +pixel_10bpc_max: times 8 dw 0x03ff + +pw_1567_3784: times 4 dw 1567, 3784 +pw_m3784_1567: times 4 dw -3784, 1567 +pw_2896_2896: times 4 dw 2896, 2896 +pw_m2896_2896: times 4 dw -2896, 2896 + +clip_18b_min: times 4 dd -0x20000 +clip_18b_max: times 4 dd 0x1ffff + +idct64_mul_16bpc: +dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017 +dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799 +dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276 +dd -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889, -1189, -3920, -2276, -3406 + +cextern inv_txfm_add_dct_dct_4x4_8bpc_ssse3 +cextern iadst_4x4_internal_8bpc_ssse3.main +cextern idct_4x8_internal_8bpc_ssse3.main +cextern iadst_4x8_internal_8bpc_ssse3.main +cextern idct_16x4_internal_8bpc_ssse3.main +cextern iadst_16x4_internal_8bpc_ssse3.main +cextern iadst_16x4_internal_8bpc_ssse3.main_pass2_end +cextern idct_8x4_internal_8bpc_ssse3.main +cextern iadst_8x4_internal_8bpc_ssse3.main +cextern idct_8x8_internal_8bpc_ssse3.main +cextern idct_8x8_internal_8bpc_ssse3.pass1_end3 +cextern iadst_8x8_internal_8bpc_ssse3.main +cextern iadst_8x8_internal_8bpc_ssse3.main_pass2_end +cextern idct_16x8_internal_8bpc_ssse3.main +cextern iadst_16x8_internal_8bpc_ssse3.main +cextern iadst_16x8_internal_8bpc_ssse3.main_pass2_end +cextern idct_8x32_internal_8bpc_ssse3.main +cextern idct_8x32_internal_8bpc_ssse3.main_fast +cextern idct_8x32_internal_8bpc_ssse3.main_veryfast +cextern idct_16x64_internal_8bpc_ssse3.main +cextern idct_16x64_internal_8bpc_ssse3.main_fast + +tbl_4x16_2d: db 0, 13, 29, 45 +tbl_4x16_h: db 0, 16, 32, 48 +tbl_4x16_v: db 0, 4, 8, 12 + +tbl_8x16_2d: db 0, 14, 30, 46 +tbl_8x16_v: db 0, 4, 8, 12 +tbl_8x16_h: db 0, 32, 64, 96 + 
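+; (The tbl_* rows above and below appear to be per-size eob cut points
+; that the transform functions use to decide how many groups of rows or
+; columns hold nonzero coefficients, with separate tables for the 2d,
+; h-only and v-only cases. Inferred from usage, not an upstream note.)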
+tbl_16x16_2d: db 0, 10, 36, 78 +tbl_16x16_v: db 0, 4, 8, 12 +tbl_16x16_h: db 0, 64, 128, 192 + +tbl_8x32_2d: dw 0, 14, 43, 75, 107, 139, 171, 203 + +tbl_16x32_2d: dw 0, 14, 44, 90, 151, 215, 279, 343 + +tbl_32x16_2d: ; first 4 entries of 32x32 are identical to this one +tbl_32x32_2d: dw 0, 10, 36, 78, 136, 210, 300, 406 + +tbl_Nx32_odd_offset: db 2*16, 2*23 + db 2*20, 2*19 + db 2*18, 2*21 + db 2*22, 2*17 + db 2*30, 2*25 + db 2*26, 2*29 + db 2*28, 2*27 + db 2*24, 2*31 + +tbl_Nx64_offset: db 2* 0, 2*32, 2*16, 2*46 + db 2* 8, 2*40, 2*23, 2*38 + db 2* 1, 2*36, 2*20, 2*42 + db 2* 9, 2*44, 2*19, 2*34 + db 2* 2, 2*60, 2*18, 2*50 + db 2*10, 2*52, 2*21, 2*58 + db 2* 3, 2*56, 2*22, 2*54 + db 2*11, 2*48, 2*17, 2*62 + +SECTION .text + +%define m_suffix(x, sfx) mangle(private_prefix %+ _ %+ x %+ sfx) +%define m(x) m_suffix(x, SUFFIX) + +; This refers to the first function in itx_sse i.e. the start of the text section +; which is needed as a base pointer for constants. +%define itx8_start m_suffix(inv_txfm_add_dct_dct_4x4_8bpc, _ssse3) + +%if ARCH_X86_64 +%define o(x) x +%else +%define o(x) r6-$$+x ; PIC +%endif + +%macro IWHT4_1D 0 + ; m0 = in0, m1 = in1, m2 = in2, m3 = in3 + paddd m0, m1 ; in0 += in1 + psubd m4, m2, m3 ; tmp0 = in2 - in3 + psubd m5, m0, m4 ; tmp1 = (in0 - tmp0) >> 1 + psrad m5, 1 + psubd m2, m5, m1 ; in2 = tmp1 - in1 + psubd m5, m3 ; in1 = tmp1 - in3 + psubd m0, m5 ; in0 -= in1 + paddd m4, m2 ; in3 = tmp0 + in2 + ; m0 = out0, m1 = in1, m2 = out2, m3 = in3 + ; m4 = out3, m5 = out1 +%endmacro + +INIT_XMM sse2 +cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 3, 6, dst, stride, c, eob, bdmax + mova m0, [cq+16*0] + mova m1, [cq+16*1] + mova m2, [cq+16*2] + mova m3, [cq+16*3] + REPX {psrad x, 2}, m0, m1, m2, m3 + IWHT4_1D + punpckldq m1, m0, m5 + punpckhdq m3, m0, m5 + punpckldq m5, m2, m4 + punpckhdq m2, m4 + punpcklqdq m0, m1, m5 + punpckhqdq m1, m5 + punpcklqdq m4, m3, m2 + punpckhqdq m3, m2 + mova m2, m4 + IWHT4_1D + packssdw m0, m4 ; low: out3, high: out0 + packssdw m2, m5 ; low: out2, high: out1 + pxor m4, m4 + mova [cq+16*0], m4 + mova [cq+16*1], m4 + mova [cq+16*2], m4 + mova [cq+16*3], m4 + lea r2, [dstq+strideq*2] + movq m1, [dstq+strideq*0] + movhps m1, [r2 +strideq*1] + movq m3, [r2 +strideq*0] + movhps m3, [dstq+strideq*1] + movd m5, bdmaxm + pshuflw m5, m5, q0000 ; broadcast + punpcklqdq m5, m5 ; broadcast + paddsw m0, m1 + paddsw m2, m3 + pmaxsw m0, m4 + pmaxsw m2, m4 + pminsw m0, m5 + pminsw m2, m5 + movhps [r2 +strideq*1], m0 ; write out0 + movhps [dstq+strideq*1], m2 ; write out1 + movq [r2 +strideq*0], m2 ; write out2 + movq [dstq+strideq*0], m0 ; write out3 + RET + +; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 +; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 +; flags: 2 = inv_dst1, 4 = inv_dst2 +; skip round/shift if rnd is not a number +%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags +; %1 dst/src[1] +; %2 dst/src[2] +; %3 tmp[1] +; %4 tmp[2] +; %5 tmp[3] +; %6 rnd +; %7 coef[1] +; %8 coef[2] +; %9 flags +%ifnidn %7,%8 ; optimize when coef1 == coef2 +%if %8 < 32 + pmulld m%4, m%1, m%8 + pmulld m%3, m%2, m%8 +%else + mova m%3, [o(pd_%8)] + pmulld m%4, m%1, m%3 + pmulld m%3, m%2 +%endif +%endif +%if %7 < 32 + pmulld m%1, m%7 + pmulld m%2, m%7 +%else + mova m%5, [o(pd_%7)] + pmulld m%1, m%5 + pmulld m%2, m%5 +%endif +%if %9 & 4 ; invert dst2 + paddd m%4, m%2 + psubd m%2, m%6, m%4 +%else +%ifnum %6 +%ifnidn %7,%8 + paddd m%4, m%6 +%else + paddd m%1, m%6 +%endif +%endif +%ifnidn %7,%8 + paddd m%2, m%4 +%else + mova m%3, m%2 + paddd m%2, m%1 
%endif
+%endif
+%if %9 & 2 ; invert dst1
+ psubd m%3, m%1
+ paddd m%1, m%3, m%6
+%else
+%ifnum %6
+%ifnidn %7,%8
+ paddd m%1, m%6
+%endif
+%endif
+ psubd m%1, m%3
+%endif
+%ifnum %6
+ psrad m%2, 12
+ psrad m%1, 12
+%endif
+%endmacro
+
+%macro INV_TXFM_FN 4-5+ 8 ; type1, type2, eob_offset, size, mmsize/stack
+cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 7, %5, dst, stride, c, eob, tx2
+ %define %%p1 m(i%1_%4_internal_16bpc)
+%if ARCH_X86_32
+ LEA r6, $$
+%endif
+%if has_epilogue
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jz %%end
+%endif
+ lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)]
+%ifnum %3
+%if %3
+ add eobd, %3
+%endif
+%else
+ lea r5, [o(%3)]
+%endif
+ call %%p1
+ RET
+%%end:
+%else
+ ; Jump to the 1st txfm function if we're not taking the fast path, which
+ ; in turn performs an indirect jump to the 2nd txfm function.
+ lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)]
+%ifnum %3
+%if %3
+ add eobd, %3
+%endif
+%else
+ lea r5, [o(%3)]
+%endif
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jnz %%p1
+%else
+ ; jump to the 1st txfm function unless it's located directly after this
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endif
+%endmacro
+
+%macro INV_TXFM_4X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 0, 4x4
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 4
+.dconly:
+ add r5d, 128
+ sar r5d, 8
+.dconly2:
+ imul r5d, 2896
+ mova m2, [o(pixel_10bpc_max)]
+ add r5d, 34816
+ movd m0, r5d
+ pshuflw m0, m0, q1111
+ pxor m3, m3
+ punpcklqdq m0, m0
+.dconly_loop:
+ movq m1, [dstq+strideq*0]
+ movhps m1, [dstq+strideq*1]
+ paddw m1, m0
+ pminsw m1, m2
+ pmaxsw m1, m3
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
+
+%macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd
+ ; butterfly rotation
+ ITX_MULSUB_2D %1, %3, %5, %6, %7, %8, 2896, 2896 ; %1 out1 %3 out0
+ ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; %2 out2 %4 out3
+ ; Hadamard rotation
+ psubd m%5, m%1, m%2
+ paddd m%2, m%1
+ paddd m%1, m%3, m%4
+ psubd m%3, m%4
+ ; %1 (src1) = out0
+ ; %2 (src2) = out1
+ ; %3 (src3) = out3
+ ; %5 (tmp1) = out2
+%endmacro
+
+INIT_XMM sse4
+
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, identity
+INV_TXFM_4X4_FN dct, adst
+INV_TXFM_4X4_FN dct, flipadst
+
+cglobal idct_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ mova m2, [cq+16*2]
+ mova m3, [cq+16*3]
+ mova m5, [o(pd_2048)]
+ call .pass1_main
+ packssdw m0, m1 ; out0 out1
+ packssdw m4, m2 ; out2 out3
+ ; transpose
+ punpckhwd m2, m0, m4
+ punpcklwd m0, m4
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ ; m0 = out0 out1
+ ; m1 = out2 out3
+ ; m5 = pd_2048
+ jmp tx2q
+.pass1_main:
+ IDCT4_1D 0, 1, 2, 3, 4, 6, 7, 5
+ ret
+.pass2:
+ ; m0 = in0 in1
+ ; m1 = in2 in3
+ ; m5 = pd_2048
+ punpckhwd m2, m1, m0
+ punpcklwd m1, m0
+ pmaddwd m4, m2, [o(pw_m3784_1567)]
+ pmaddwd m2, [o(pw_1567_3784)]
+ pmaddwd m0, m1, [o(pw_m2896_2896)]
+ pmaddwd m1, [o(pw_2896_2896)]
+ REPX {paddd x, m5}, m4, m2, m0, m1
+ packssdw m5, m5 ; pw_2048
+ REPX {psrad x, 12}, m4, m2, m0, m1
+ packssdw m2, m4 ; t3 t2
+ packssdw m1, m0 ; t0 t1
+ paddsw m0, m1, m2 ; out0 out1
+ psubsw m1, m2 ; out3 out2
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ movq m2, [dstq+strideq*0]
+ movhps m2, [dstq+strideq*1]
+ lea r5, [dstq+strideq*2]
+ movq m3, [r5 +strideq*1]
+ movhps m3, [r5 +strideq*0]
+ mova m5, [o(pixel_10bpc_max)]
+ pxor m4, m4
+ mova [cq+16*0], m4
+ mova [cq+16*1], m4
+ mova [cq+16*2], m4
+ mova
[cq+16*3], m4 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m5 + pminsw m1, m5 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movhps [r5 +strideq*0], m1 + movq [r5 +strideq*1], m1 + RET + +INV_TXFM_4X4_FN adst, dct +INV_TXFM_4X4_FN adst, adst +INV_TXFM_4X4_FN adst, flipadst +INV_TXFM_4X4_FN adst, identity + +cglobal iadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call .main + packssdw m0, m2 ; out0 out1 + packssdw m1, m4 ; out2 out3 + ; transpose + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + ; m0 = out0 out1 + ; m1 = out2 out3 + ; m5 = pd_2048 + jmp tx2q +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main +.end: + mova m4, [o(pw_2048)] + movq m2, [dstq+strideq*0] + movhps m2, [dstq+strideq*1] + lea r5, [dstq+strideq*2] + movq m3, [r5 +strideq*0] + movhps m3, [r5 +strideq*1] + mova m5, [o(pixel_10bpc_max)] + pmulhrsw m0, m4 + pmulhrsw m1, m4 + pxor m4, m4 + mova [cq+16*0], m4 + mova [cq+16*1], m4 + mova [cq+16*2], m4 + mova [cq+16*3], m4 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m5 + pminsw m1, m5 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [r5 +strideq*0], m1 + movhps [r5 +strideq*1], m1 + RET +ALIGN function_align +.main: + mova m1, [cq+16*2] + mova m3, [cq+16*3] + mova m5, [cq+16*0] + lea r3, [cq+16*1] +.main2: + mova m0, [o(pd_1321)] ; SINPI_1_9 + mova m2, [o(pd_2482)] ; SINPI_2_9 + mova m6, [o(pd_3803)] ; SINPI_4_9 + pmulld m4, m0, m1 ; s[4] = SINPI_1_9 * T[2] + pmulld m7, m3, m6 ; s[6] = SINPI_4_9 * T[3] + pmulld m6, m1 ; s[3] = SINPI_4_9 * T[2] + pmulld m0, m5 ; s[0] = SINPI_1_9 * T[0] + psubd m1, m3 ; T[2] - T[3] + pmulld m3, m2 ; s[5] = SINPI_2_9 * T[3] + pmulld m2, m5 ; s[1] = SINPI_2_9 * T[0] + paddd m0, m6 ; s[0] += s[3] + paddd m0, m3 ; s[0] += s[5] + mova m3, [o(pd_m3344)] ; -SINPI_3_9 + psubd m2, m4 ; s[1] -= s[4] + psubd m2, m7 ; s[1] -= s[6] + psubd m1, m5 ; -b7 = (T[2] -T[3]) - T[0] + pmulld m1, m3 ; s[2] = -SINPI_3_9 * -b7 + pmulld m3, [r3] ; -s[3] = -SINPI_3_9 * T[1] + mova m5, [o(pd_2048)] + REPX {paddd x, m5}, m0, m1 ; {s[0], s[2]} + 2048 + paddd m4, m0, m2 ; x[3] = s[0] + s[1] + psubd m2, m3 ; x[1] = s[1] + s[3] + psubd m0, m3 ; x[0] = s[0] + s[3] + paddd m4, m3 ; x[3] -= s[3] + paddd m2, m5 ; x[1] + 2048 + REPX {psrad x, 12}, m0, m2, m1, m4 + ret + + +INV_TXFM_4X4_FN flipadst, dct +INV_TXFM_4X4_FN flipadst, adst +INV_TXFM_4X4_FN flipadst, flipadst +INV_TXFM_4X4_FN flipadst, identity + +cglobal iflipadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call m(iadst_4x4_internal_16bpc).main + packssdw m0, m2 ; out0 out1 + packssdw m1, m4 ; out2 out3 + ; transpose + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + ; m0 = out0 out1 + ; m1 = out2 out3 + ; m5 = pd_2048 + jmp tx2q +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main + mova m4, [o(pw_2048)] + movq m3, [dstq+strideq*1] + movhps m3, [dstq+strideq*0] + lea r5, [dstq+strideq*2] + movq m2, [r5 +strideq*1] + movhps m2, [r5 +strideq*0] + mova m5, [o(pixel_10bpc_max)] + pmulhrsw m0, m4 + pmulhrsw m1, m4 + pxor m4, m4 + mova [cq+16*0], m4 + mova [cq+16*1], m4 + mova [cq+16*2], m4 + mova [cq+16*3], m4 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m5 + pminsw m1, m5 + movhps [dstq+strideq*0], m1 + movq 
[dstq+strideq*1], m1 + movhps [r5 +strideq*0], m0 + movq [r5 +strideq*1], m0 + RET + +INV_TXFM_4X4_FN identity, dct +INV_TXFM_4X4_FN identity, adst +INV_TXFM_4X4_FN identity, flipadst +INV_TXFM_4X4_FN identity, identity + +cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + mova m3, [o(pd_5793)] + pmulld m0, m3, [cq+16*0] + pmulld m1, m3, [cq+16*1] + pmulld m2, m3, [cq+16*2] + pmulld m3, [cq+16*3] + mova m5, [o(pd_2048)] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + packssdw m0, m1 + packssdw m2, m3 + ; transpose + punpckhwd m3, m0, m2 + punpcklwd m0, m2 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + ; m0 = out0 out1 + ; m1 = out2 out3 + ; m5 = pd_2048 + jmp tx2q +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 + ; m5 = pd_2048 + mova m4, [o(pw_1697x8)] + movq m2, [dstq+strideq*0] + movhps m2, [dstq+strideq*1] + lea r5, [dstq+strideq*2] + pmulhrsw m3, m4, m0 + pmulhrsw m4, m1 + paddsw m0, m3 + paddsw m1, m4 + movq m3, [r5 +strideq*0] + movhps m3, [r5 +strideq*1] + mova m4, [o(pixel_10bpc_max)] + packssdw m5, m5 ; pw_2048 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + pxor m5, m5 + mova [cq+16*0], m5 + mova [cq+16*1], m5 + mova [cq+16*2], m5 + mova [cq+16*3], m5 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m5 + pmaxsw m1, m5 + pminsw m0, m4 + pminsw m1, m4 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [r5 +strideq*0], m1 + movhps [r5 +strideq*1], m1 + RET + +%macro INV_TXFM_4X8_FN 2-3 0 ; type1, type2, eob_offset + INV_TXFM_FN %1, %2, %3, 4x8 +%ifidn %1_%2, dct_dct + imul r5d, [cq], 181 + mov [cq], eobd ; 0 + mov r3d, 8 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 + jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly +%endif +%endmacro + +INV_TXFM_4X8_FN dct, dct +INV_TXFM_4X8_FN dct, identity, 9 +INV_TXFM_4X8_FN dct, adst +INV_TXFM_4X8_FN dct, flipadst + +cglobal idct_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp + mova m5, [o(pd_2048)] +%if ARCH_X86_64 + xor r5d, r5d + cmp eobd, 13 + setge r5b +%else + mov r5d, 1 + cmp eobd, 13 + sbb r5d, 0 +%endif + shl r5d, 4 +.loop_pass1: + mova m3, [o(pd_2896)] + pmulld m0, m3, [cq+32*0+r5] + pmulld m1, m3, [cq+32*1+r5] + pmulld m2, m3, [cq+32*2+r5] + pmulld m3, [cq+32*3+r5] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + call m(idct_4x4_internal_16bpc).pass1_main + packssdw m0, m1 ; out0 out1 + packssdw m4, m2 ; out2 out3 + test r5d, r5d + jz .end_pass1 + mova [cq+32*0+16], m0 + mova [cq+32*1+16], m4 + xor r5d, r5d + jmp .loop_pass1 +.end_pass1: + punpckhwd m2, m0, m4 + punpcklwd m0, m4 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + mova m2, [cq+32*0+16] + mova m6, [cq+32*1+16] + punpckhwd m4, m2, m6 + punpcklwd m2, m6 + punpckhwd m3, m2, m4 + punpcklwd m2, m4 + ; m0-3 = packed & transposed output + jmp tx2q +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(idct_4x8_internal_8bpc, _ssse3).main + ; m0-3 is now out0/1,3/2,4/5,7/6 + mova m4, [o(pw_2048)] + shufps m1, m1, q1032 + shufps m3, m3, q1032 +.end: + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + pxor m4, m4 + REPX {mova [cq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7 + mova m7, [o(pixel_10bpc_max)] + lea r2, [strideq*3] + movq m5, [dstq+strideq*0] + movq m6, [dstq+strideq*2] + movhps m5, [dstq+strideq*1] + movhps m6, [dstq+r2] + lea r4, [dstq+strideq*4] + paddw m0, m5 + paddw m1, m6 + movq m5, [r4+strideq*0] + movq m6, [r4+strideq*2] + movhps m5, [r4+strideq*1] + movhps m6, [r4+r2] + paddw m2, m5 + paddw m3, m6 + REPX {pminsw x, m7}, m0, m1, m2, m3 + REPX {pmaxsw x, m4}, m0, m1, m2, m3 + movq 
[dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [dstq+strideq*2], m1 + movhps [dstq+r2 ], m1 + movq [r4 +strideq*0], m2 + movhps [r4 +strideq*1], m2 + movq [r4 +strideq*2], m3 + movhps [r4 +r2 ], m3 + RET + +INV_TXFM_4X8_FN adst, dct +INV_TXFM_4X8_FN adst, adst +INV_TXFM_4X8_FN adst, flipadst +INV_TXFM_4X8_FN adst, identity, 9 + +cglobal iadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call .pass1_main + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + mova m2, [cq+32*2+16] + mova m6, [cq+32*3+16] + punpckhwd m4, m2, m6 + punpcklwd m2, m6 + punpckhwd m3, m2, m4 + punpcklwd m2, m4 + ; m0-3 = packed & transposed output + jmp tx2q +.pass1_main: +%undef cmp +%if ARCH_X86_64 + xor r5d, r5d + cmp eobd, 13 + setge r5b +%else + mov r5d, 1 + cmp eobd, 13 + sbb r5d, 0 +%endif + shl r5d, 4 + lea r3, [cq+32*1+16] +.loop_pass1: + mova m0, [o(pd_2048)] + mova m3, [o(pd_2896)] + pmulld m5, m3, [cq+32*0+r5] + pmulld m2, m3, [cq+32*1+r5] + pmulld m1, m3, [cq+32*2+r5] + pmulld m3, [cq+32*3+r5] + REPX {paddd x, m0}, m5, m2, m1, m3 + REPX {psrad x, 12}, m5, m2, m1, m3 + mova [r3], m2 + call m(iadst_4x4_internal_16bpc).main2 + packssdw m0, m2 ; out0 out1 + packssdw m1, m4 ; out2 out3 + test r5d, r5d + jz .end_pass1 + mova [cq+32*2+16], m0 + mova [cq+32*3+16], m1 + xor r5d, r5d + jmp .loop_pass1 +.end_pass1: + ret +.pass2: + shufps m0, m0, q1032 + shufps m1, m1, q1032 +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main + mova m4, [o(pw_4x2048_4xm2048)] + jmp m(idct_4x8_internal_16bpc).end + +INV_TXFM_4X8_FN flipadst, dct +INV_TXFM_4X8_FN flipadst, adst +INV_TXFM_4X8_FN flipadst, flipadst +INV_TXFM_4X8_FN flipadst, identity, 9 + +cglobal iflipadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call m(iadst_4x8_internal_16bpc).pass1_main + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + mova m6, [cq+32*2+16] + mova m2, [cq+32*3+16] + punpcklwd m4, m2, m6 + punpckhwd m2, m6 + punpckhwd m3, m2, m4 + punpcklwd m2, m4 + ; m0-3 = packed & transposed output + jmp tx2q +.pass2: + shufps m0, m0, q1032 + shufps m1, m1, q1032 +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main + mova m4, m0 + mova m5, m1 + pshufd m0, m3, q1032 + pshufd m1, m2, q1032 + pshufd m2, m5, q1032 + pshufd m3, m4, q1032 + mova m4, [o(pw_4xm2048_4x2048)] + jmp m(idct_4x8_internal_16bpc).end + +INV_TXFM_4X8_FN identity, dct +INV_TXFM_4X8_FN identity, adst +INV_TXFM_4X8_FN identity, flipadst +INV_TXFM_4X8_FN identity, identity, 3 + +cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp + mova m5, [o(pd_2048)] + mova m4, [o(pd_2896)] + mova m6, [o(pd_5793)] + ; clear m7 in case we skip the bottom square + pxor m7, m7 +%if ARCH_X86_64 + xor r5d, r5d + cmp eobd, 16 + setge r5b +%else + mov r5d, 1 + cmp eobd, 16 + sbb r5d, 0 +%endif + shl r5d, 4 +.loop_pass1: + pmulld m0, m4, [cq+32*0+r5] + pmulld m1, m4, [cq+32*1+r5] + pmulld m2, m4, [cq+32*2+r5] + pmulld m3, m4, [cq+32*3+r5] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + REPX {pmulld x, m6}, m0, m1, m2, m3 + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + packssdw m0, m1 + packssdw m2, m3 + test r5d, r5d + jz .end_pass1 + mova [cq+32*0+16], m0 + mova m7, m2 + xor r5d, r5d + jmp .loop_pass1 +.end_pass1: + punpckhwd m4, m0, m2 + punpcklwd m0, m2 + punpckhwd m1, m0, m4 + punpcklwd m0, m4 + mova m2, [cq+32*0+16] + 
punpckhwd m4, m2, m7 + punpcklwd m2, m7 + punpckhwd m3, m2, m4 + punpcklwd m2, m4 + ; m0-3 = packed & transposed output + jmp tx2q +.pass2: + mova m4, [o(pw_4096)] + jmp m(idct_4x8_internal_16bpc).end + +%macro INV_TXFM_4X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix + INV_TXFM_FN %1, %2, tbl_4x16_%3, 4x16 +%ifidn %1_%2, dct_dct + imul r5d, [cq], 181 + mov [cq], eobd ; 0 + mov r3d, 16 + add r5d, 384 + sar r5d, 9 + jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly2 +%endif +%endmacro + +INV_TXFM_4X16_FN dct, dct +INV_TXFM_4X16_FN dct, identity, v +INV_TXFM_4X16_FN dct, adst +INV_TXFM_4X16_FN dct, flipadst + +cglobal idct_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp +%if ARCH_X86_32 + mov r5m, r6d +%endif + mov r6d, 4 +.zero_loop: + dec r6d + cmp eobb, byte [r5+r6] + jl .zero_loop + mov r5d, r6d + shl r5d, 4 +%if ARCH_X86_32 + ; restore pic-ptr + mov r6, r5m +%endif + mova m5, [o(pd_2048)] +.loop_pass1: + mova m0, [cq+64*0+r5] + mova m1, [cq+64*1+r5] + mova m2, [cq+64*2+r5] + mova m3, [cq+64*3+r5] + call m(idct_4x4_internal_16bpc).pass1_main + pcmpeqd m3, m3 + REPX {psubd x, m3}, m0, m1, m4, m2 + REPX {psrad x, 1}, m0, m1, m4, m2 + packssdw m0, m1 ; out0 out1 + packssdw m4, m2 ; out2 out3 + punpckhwd m2, m0, m4 + punpcklwd m0, m4 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + test r5d, r5d + jz .end_pass1 + mova [cq+64*0+r5], m0 + mova [cq+64*1+r5], m1 + sub r5d, 16 + jmp .loop_pass1 +.end_pass1: + mova m2, [cq+64*0+16] + mova m3, [cq+64*1+16] + mova m4, [cq+64*0+32] + mova m5, [cq+64*1+32] + mova m6, [cq+64*0+48] + mova m7, [cq+64*1+48] + ; m0-7 = packed & transposed output + jmp tx2q +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(idct_16x4_internal_8bpc, _ssse3).main + ; m0-6 is out0-13 [with odd registers having inversed output] + ; [coeffq+16*7] has out15/14 + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [cq+16*7] + REPX {shufps x, x, q1032}, m1, m3, m5, m7 + mova [cq+16*0], m4 + mova [cq+16*1], m5 + mova [cq+16*2], m6 + mova [cq+16*3], m7 +.end: + pxor m4, m4 + REPX {mova [cq+16*x], m4}, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + mova m7, [o(pixel_10bpc_max)] + mov r5d, 2 + lea r3, [strideq*3] +.loop: + movq m5, [dstq+strideq*0] + movq m6, [dstq+strideq*2] + movhps m5, [dstq+strideq*1] + movhps m6, [dstq+r3] + lea r4, [dstq+strideq*4] + paddw m0, m5 + paddw m1, m6 + movq m5, [r4+strideq*0] + movq m6, [r4+strideq*2] + movhps m5, [r4+strideq*1] + movhps m6, [r4+r3] + paddw m2, m5 + paddw m3, m6 + REPX {pminsw x, m7}, m0, m1, m2, m3 + REPX {pmaxsw x, m4}, m0, m1, m2, m3 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [dstq+strideq*2], m1 + movhps [dstq+r3 ], m1 + movq [r4 +strideq*0], m2 + movhps [r4 +strideq*1], m2 + movq [r4 +strideq*2], m3 + movhps [r4 +r3 ], m3 + dec r5d + jz .end2 + lea dstq, [dstq+strideq*8] + mova m0, [cq+0*16] + mova m1, [cq+1*16] + mova m2, [cq+2*16] + mova m3, [cq+3*16] + REPX {mova [cq+x*16], m4}, 0, 1, 2, 3 + jmp .loop +.end2: + RET + +INV_TXFM_4X16_FN adst, dct +INV_TXFM_4X16_FN adst, adst +INV_TXFM_4X16_FN adst, flipadst +INV_TXFM_4X16_FN adst, identity, v + +cglobal iadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp +%if ARCH_X86_32 + mov r5m, r6d +%endif + mov r6d, 4 +.zero_loop: + dec r6d + cmp eobb, byte [r6+r5] + jl .zero_loop + mov r5d, r6d + shl r5d, 4 +%if ARCH_X86_32 + ; restore pic-ptr + mov r6, r5m +%endif +.loop_pass1: + mova m5, [cq+64*0+r5] + lea r3, [cq+64*1+r5] + mova m1, [cq+64*2+r5] + mova m3, [cq+64*3+r5] + call 
m(iadst_4x4_internal_16bpc).main2 + pcmpeqd m3, m3 + REPX {psubd x, m3}, m0, m2, m1, m4 + REPX {psrad x, 1}, m0, m2, m1, m4 + packssdw m0, m2 ; out0 out1 + packssdw m1, m4 ; out2 out3 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + test r5d, r5d + jz m(idct_4x16_internal_16bpc).end_pass1 + mova [cq+64*0+r5], m0 + mova [cq+64*1+r5], m1 + sub r5d, 16 + jmp .loop_pass1 +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main + call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end + ; m7/5/2/4 = out4/-11,-5/10,6/-9,-7/8 + ; m0/3 & cq6/7 = out0/-15,-3/12,-1/14,2/-13 + mova m1, [o(pw_4x2048_4xm2048)] + REPX {pmulhrsw x, m1}, m7, m2, m0 + pshufd m6, m1, q1032 ; 4x-2048,4x2048 + pmulhrsw m1, [cq+16*7] + REPX {pmulhrsw x, m6}, m5, m4, m3 + pmulhrsw m6, [cq+16*6] + ; m7/5/2/4 = out4/11,5/10,6/9,7/8 + ; m0/3/6/1 = out0/15,3/12,1/14,2/13 + ; output should be as 0-3 for out0-7, and cq+0-3*16 for out8-15 + movhps [cq+0*8], m4 + movhps [cq+1*8], m2 + movhps [cq+2*8], m5 + movhps [cq+3*8], m7 + movhps [cq+4*8], m3 + movhps [cq+5*8], m1 + movhps [cq+6*8], m6 + movhps [cq+7*8], m0 + punpcklqdq m0, m6 + punpcklqdq m1, m3 + punpcklqdq m3, m2, m4 + punpcklqdq m2, m7, m5 + jmp m(idct_4x16_internal_16bpc).end + +INV_TXFM_4X16_FN flipadst, dct +INV_TXFM_4X16_FN flipadst, adst +INV_TXFM_4X16_FN flipadst, flipadst +INV_TXFM_4X16_FN flipadst, identity, v + +cglobal iflipadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp +%if ARCH_X86_32 + mov r5m, r6d +%endif + mov r6d, 4 +.zero_loop: + dec r6d + cmp eobb, byte [r5+r6] + jl .zero_loop + mov r5d, r6d + shl r5d, 4 +%if ARCH_X86_32 + ; restore pic-ptr + mov r6, r5m +%endif +.loop_pass1: + mova m5, [cq+64*0+r5] + lea r3, [cq+64*1+r5] + mova m1, [cq+64*2+r5] + mova m3, [cq+64*3+r5] + call m(iadst_4x4_internal_16bpc).main2 + pcmpeqd m3, m3 + REPX {psubd x, m3}, m0, m2, m1, m4 + REPX {psrad x, 1}, m0, m2, m1, m4 + packssdw m0, m2 ; out3 out2 + packssdw m1, m4 ; out1 out0 + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + test r5d, r5d + jz m(idct_4x16_internal_16bpc).end_pass1 + mova [cq+64*0+r5], m0 + mova [cq+64*1+r5], m1 + sub r5d, 16 + jmp .loop_pass1 +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main + call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end + ; m7/5/2/4 = out11/-4,-10/5,9/-6,-8/7 + ; m0/3 & cq6/7 = out15/-0,-12/3,-14/1,13/-2 + mova m1, [o(pw_4x2048_4xm2048)] + REPX {pmulhrsw x, m1}, m7, m2, m0 + pshufd m6, m1, q1032 ; 4x-2048,4x2048 + pmulhrsw m1, [cq+16*7] + REPX {pmulhrsw x, m6}, m5, m4, m3 + pmulhrsw m6, [cq+16*6] + ; m7/5/2/4 = out11/4,10/5,9/6,8/7 + ; m0/3/6/1 = out15/0,12/3,14/1,13/2 + ; output should be as 0-3 for out0-7, and cq+0-3*16 for out8-15 + movq [cq+0*8], m4 + movq [cq+1*8], m2 + movq [cq+2*8], m5 + movq [cq+3*8], m7 + movq [cq+4*8], m3 + movq [cq+5*8], m1 + movq [cq+6*8], m6 + movq [cq+7*8], m0 + punpckhqdq m0, m6 + punpckhqdq m1, m3 + punpckhqdq m3, m2, m4 + punpckhqdq m2, m7, m5 + jmp m(idct_4x16_internal_16bpc).end + +INV_TXFM_4X16_FN identity, dct, h +INV_TXFM_4X16_FN identity, adst, h +INV_TXFM_4X16_FN identity, flipadst, h +INV_TXFM_4X16_FN identity, identity + +cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp +%if ARCH_X86_32 + mov r5m, r6d +%endif + mov r6d, 4 +.zero_loop: + dec r6d + cmp eobb, byte [r5+r6] + jl .zero_loop + mov r5d, r6d + shl r5d, 4 +%if 
ARCH_X86_32 + ; restore pic-ptr + mov r6, r5m +%endif + mova m5, [o(pd_6144)] + mova m4, [o(pd_5793)] +.loop_pass1: + pmulld m0, m4, [cq+64*0+r5] + pmulld m1, m4, [cq+64*1+r5] + pmulld m2, m4, [cq+64*2+r5] + pmulld m3, m4, [cq+64*3+r5] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 13}, m0, m1, m2, m3 + packssdw m0, m1 + packssdw m2, m3 + punpckhwd m3, m0, m2 + punpcklwd m0, m2 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + test r5d, r5d + jz m(idct_4x16_internal_16bpc).end_pass1 + mova [cq+64*0+r5], m0 + mova [cq+64*1+r5], m1 + sub r5d, 16 + jmp .loop_pass1 +.pass2: + mova [cq+16*4], m0 + mova [cq+16*5], m1 + mova [cq+16*6], m2 + mova [cq+16*7], m7 + mova m0, [o(pw_1697x16)] + mova m7, [o(pw_2048)] + pmulhrsw m1, m0, m4 + pmulhrsw m2, m0, m5 + REPX {paddsw x, x}, m4, m5 + paddsw m4, m1 + paddsw m5, m2 + REPX {pmulhrsw x, m7}, m4, m5 + mova [cq+16*0], m4 + mova [cq+16*1], m5 + mova m4, [cq+16*7] + pmulhrsw m1, m0, m6 + pmulhrsw m2, m0, m4 + REPX {paddsw x, x}, m6, m4 + paddsw m6, m1 + paddsw m4, m2 + REPX {pmulhrsw x, m7}, m6, m4 + mova [cq+16*2], m6 + mova [cq+16*3], m4 + mova m4, [cq+16*4] + mova m1, [cq+16*5] + mova m2, [cq+16*6] + pmulhrsw m5, m0, m2 + pmulhrsw m6, m0, m3 + REPX {paddsw x, x}, m2, m3 + paddsw m2, m5 + paddsw m3, m6 + pmulhrsw m6, m0, m1 + pmulhrsw m0, m4 + REPX {paddsw x, x}, m1, m4 + paddsw m1, m6 + paddsw m0, m4 + REPX {pmulhrsw x, m7}, m2, m3, m1, m0 + jmp m(idct_4x16_internal_16bpc).end + +%macro INV_TXFM_8X4_FN 2 ; type1, type2 +%if ARCH_X86_64 + INV_TXFM_FN %1, %2, 0, 8x4, 15 +%else + INV_TXFM_FN %1, %2, 0, 8x4, 8, 0-4*16 +%endif +%ifidn %1_%2, dct_dct + imul r5d, [cq], 181 + mov [cq], eobd ; 0 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 + add r5d, 128 + sar r5d, 8 + imul r5d, 2896 + add r5d, 34816 + movd m0, r5d + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 + mova m6, [o(pixel_10bpc_max)] + pxor m5, m5 + lea r2, [strideq*3] + mova m1, [dstq+strideq*0] + mova m2, [dstq+strideq*1] + mova m3, [dstq+strideq*2] + mova m4, [dstq+r2] + REPX {paddw x, m0}, m1, m2, m3, m4 + REPX {pmaxsw x, m5}, m1, m2, m3, m4 + REPX {pminsw x, m6}, m1, m2, m3, m4 + mova [dstq+strideq*0], m1 + mova [dstq+strideq*1], m2 + mova [dstq+strideq*2], m3 + mova [dstq+r2 ], m4 + RET +%endif +%endmacro + +INV_TXFM_8X4_FN dct, dct +INV_TXFM_8X4_FN dct, identity +INV_TXFM_8X4_FN dct, adst +INV_TXFM_8X4_FN dct, flipadst + +cglobal idct_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + lea r5, [o(.main)] +.pass1_entry: +%if ARCH_X86_32 + lea r3, [rsp+gprsize] +%else + mova m11, [o(pd_2048)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] + mova m14, [o(pd_2896)] +%endif + mova m0, [cq+0*16] + mova m1, [cq+1*16] + mova m2, [cq+2*16] + mova m3, [cq+3*16] + mova m4, [cq+4*16] + mova m5, [cq+5*16] + mova m6, [cq+6*16] + mova m7, [cq+7*16] + call .rect2_mul + call r5 + call .transpose4x8packed + ; m0-3 = packed & transposed output + jmp tx2q +.transpose4x8packed: + ; transpose + punpcklwd m1, m2, m6 + punpckhwd m2, m6 + punpckhwd m6, m0, m4 + punpcklwd m0, m4 + + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + punpckhwd m4, m6, m2 + punpcklwd m6, m2 + + punpcklwd m2, m3, m4 + punpckhwd m3, m4 + punpckhwd m1, m0, m6 + punpcklwd m0, m6 + ret +.main: + call .main_pass1 + call .round + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + packssdw m6, m7 + ret +.rect2_mul: +%if ARCH_X86_64 + REPX {pmulld x, m14}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 +%else + mova [r3], m7 + mova m7, [o(pd_2896)] + REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6 + 
pmulld m7, [r3] + mova [r3], m7 + mova m7, [o(pd_2048)] + REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 + paddd m7, [r3] +%endif + REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7 + ret +%if ARCH_X86_64 +.main_pass1_fast: + pmulld m5, m3, [o(pd_m2276)] + pmulld m3, [o(pd_3406)] + pmulld m7, m1, [o(pd_4017)] + pmulld m1, [o(pd_799)] + pmulld m6, m2, [o(pd_3784)] + pmulld m2, [o(pd_1567)] + pmulld m0, m14 + pxor m4, m4 + jmp .main_pass1_fast2 +.main_pass1: + ITX_MULSUB_2D 5, 3, 8, 9, 10, _, 3406, 2276 ; t5a t6a + ITX_MULSUB_2D 1, 7, 8, 9, 10, _, 799, 4017 ; t4a t7a + ITX_MULSUB_2D 2, 6, 8, 9, 10, _, 1567, 3784 ; t2 t3 + REPX {pmulld x, m14}, m0, m4 +.main_pass1_fast2: + REPX {paddd x, m11}, m1, m2, m3, m5, m6, m7 + REPX {psrad x, 12 }, m1, m2, m3, m5, m6, m7 + paddd m8, m1, m5 ; t4 + psubd m1, m5 ; t5a + paddd m9, m7, m3 ; t7 + psubd m7, m3 ; t6a + REPX {pmaxsd x, m12}, m1, m8, m7, m9 + REPX {pminsd x, m13}, m1, m8, m7, m9 + REPX {pmulld x, m14}, m7, m1 + paddd m0, m11 + paddd m7, m11 + psubd m5, m0, m4 + paddd m0, m4 + psubd m4, m7, m1 + paddd m7, m1 + REPX {psrad x, 12 }, m5, m0, m4, m7 + psubd m3, m0, m6 ; dct4 out3 + paddd m0, m6 ; dct4 out0 + paddd m6, m5, m2 ; dct4 out1 + psubd m5, m2 ; dct4 out2 + REPX {pmaxsd x, m12}, m0, m6, m5, m3 + REPX {pminsd x, m13}, m0, m6, m5, m3 + ret +.round: + paddd m1, m6, m7 ; out1 + psubd m6, m7 ; out6 + psubd m7, m0, m9 ; out7 + paddd m0, m9 ; out0 + paddd m2, m5, m4 ; out2 + psubd m5, m4 ; out5 + psubd m4, m3, m8 ; out4 + paddd m3, m8 ; out3 +%else +.main_pass1_fast: + pmulld m5, m3, [o(pd_m2276)] + pmulld m3, [o(pd_3406)] + pmulld m7, m1, [o(pd_4017)] + pmulld m1, [o(pd_799)] + pmulld m6, m2, [o(pd_3784)] + pmulld m2, [o(pd_1567)] + mova m4, [o(pd_2048)] + mova [r3+0*16], m2 + REPX {paddd x, m4}, m5, m3, m7, m1 + REPX {psrad x, 12}, m5, m3, m7, m1 + paddd m2, m1, m5 ; t4 + psubd m1, m5 ; t5a + pmulld m5, m0, [o(pd_2896)] + mova m0, m4 + paddd m4, m7, m3 ; t7 + psubd m7, m3 ; t6a + mova m3, [o(clip_18b_min)] + REPX {pmaxsd x, m3 }, m1, m2, m7, m4 + mova m3, [o(clip_18b_max)] + REPX {pminsd x, m3 }, m1, m2, m7, m4 + mova [r3+3*16], m2 + mova [r3+1*16], m4 + pxor m4, m4 + mova m2, [r3+0*16] + mova m3, [o(pd_2896)] + jmp .main_pass1_fast2 +.main_pass1: + mova [r3+0*16], m0 + mova [r3+1*16], m2 + mova [r3+2*16], m4 + mova [r3+3*16], m6 + mova m0, [o(pd_2048)] + ITX_MULSUB_2D 5, 3, 2, 4, 6, 0, 3406, 2276 ; t5a t6a + ITX_MULSUB_2D 1, 7, 2, 4, 6, 0, 799, 4017 ; t4a t7a + paddd m2, m1, m5 ; t4 + psubd m1, m5 ; t5a + paddd m4, m7, m3 ; t7 + psubd m7, m3 ; t6a + mova m6, [o(clip_18b_min)] + REPX {pmaxsd x, m6 }, m1, m2, m7, m4 + mova m6, [o(clip_18b_max)] + REPX {pminsd x, m6 }, m1, m2, m7, m4 + mova m6, [r3+3*16] + mova [r3+3*16], m2 + mova m2, [r3+1*16] + mova [r3+1*16], m4 + + ITX_MULSUB_2D 2, 6, 4, 3, 5, _, 1567, 3784 ; t2 t3 + mova m3, [o(pd_2896)] + mova m5, [r3+0*16] + mova m4, [r3+2*16] + REPX {pmulld x, m3 }, m5, m4 +.main_pass1_fast2: + REPX {paddd x, m0 }, m2, m6 + REPX {psrad x, 12 }, m2, m6 + REPX {pmulld x, m3 }, m7, m1 + paddd m7, m0 + paddd m0, m5 + + psubd m5, m0, m4 + paddd m0, m4 + psubd m4, m7, m1 + paddd m7, m1 + REPX {psrad x, 12 }, m5, m0, m4, m7 + psubd m3, m0, m6 ; dct4 out3 + paddd m0, m6 ; dct4 out0 + paddd m6, m5, m2 ; dct4 out1 + psubd m5, m2 ; dct4 out2 + + mova m1, [o(clip_18b_min)] + REPX {pmaxsd x, m1 }, m0, m6, m5, m3 + mova m1, [o(clip_18b_max)] + REPX {pminsd x, m1 }, m0, m6, m5, m3 + ret +.round: + paddd m1, m6, m7 ; out1 + psubd m6, m7 ; out6 + mova [r3+0*16], m6 + mova m6, [r3+1*16] + psubd m7, m0, m6 ; out7 + paddd m0, 
m6 ; out0 + paddd m2, m5, m4 ; out2 + psubd m5, m4 ; out5 + mova m6, [r3+3*16] + psubd m4, m3, m6 ; out4 + paddd m3, m6 ; out3 + mova m6, [r3+0*16] +%endif + ret + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(idct_8x4_internal_8bpc, _ssse3).main +.end: + lea r3, [strideq*3] + call .round2_and_write_8x4 + REPX {mova [cq+16*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 + RET +.round2_and_write_8x4: + pxor m6, m6 + mova m5, [o(pixel_10bpc_max)] + mova m4, [o(pw_2048)] +.round1_and_write_8x4: + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 +.write_8x4: + paddw m0, [dstq+strideq*0] + paddw m1, [dstq+strideq*1] + paddw m2, [dstq+strideq*2] + paddw m3, [dstq+r3] + REPX {pminsw x, m5}, m0, m1, m2, m3 + REPX {pmaxsw x, m6}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r3 ], m3 + ret + +INV_TXFM_8X4_FN adst, dct +INV_TXFM_8X4_FN adst, adst +INV_TXFM_8X4_FN adst, flipadst +INV_TXFM_8X4_FN adst, identity + +cglobal iadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + lea r5, [o(.main)] + jmp m(idct_8x4_internal_16bpc).pass1_entry +.main: + call .main_pass1 + call .round + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + packssdw m6, m7 + ret +.main_pass1: +%if ARCH_X86_64 + ITX_MULSUB_2D 7, 0, 8, 9, 10, 11, 401, 4076 ; t1a, t0a + ITX_MULSUB_2D 1, 6, 8, 9, 10, 11, 3920, 1189 ; t7a, t6a + ITX_MULSUB_2D 5, 2, 8, 9, 10, 11, 1931, 3612 ; t3a, t2a + ITX_MULSUB_2D 3, 4, 8, 9, 10, 11, 3166, 2598 ; t5a, t4a + psubd m8, m2, m6 ; t6 + paddd m2, m6 ; t2 + psubd m6, m0, m4 ; t4 + paddd m0, m4 ; t0 + psubd m4, m5, m1 ; t7 + paddd m5, m1 ; t3 + psubd m1, m7, m3 ; t5 + paddd m7, m3 ; t1 + REPX {pmaxsd x, m12}, m6, m1, m8, m4, m2, m0, m5, m7 + REPX {pminsd x, m13}, m6, m1, m8, m4, m2, m0, m5, m7 + ITX_MULSUB_2D 6, 1, 3, 9, 10, 11, 1567, 3784 ; t5a, t4a + ITX_MULSUB_2D 4, 8, 3, 9, 10, 11, 3784, 10 ; t6a, t7a + psubd m9, m6, m8 ; t7 + paddd m6, m8 ; out6 + mova m8, [o(pd_2896)] + psubd m3, m7, m5 ; t3 + paddd m7, m5 ; -out7 + psubd m5, m0, m2 ; t2 + paddd m0, m2 ; out0 + psubd m2, m1, m4 ; t6 + paddd m1, m4 ; -out1 + REPX {pmaxsd x, m12}, m5, m3, m2, m9 + REPX {pminsd x, m13}, m5, m3, m2, m9 + REPX {pmulld x, m14}, m5, m3, m2, m9 + psubd m4, m5, m3 ; (t2 - t3) * 2896 + paddd m3, m5 ; (t2 + t3) * 2896 + psubd m5, m2, m9 ; (t6 - t7) * 2896 + paddd m2, m9 ; (t6 + t7) * 2896 + ret +.round: + + ; m0=out0,m1=-out1,m6=out6,m7=-out7 + + pcmpeqd m8, m8 + REPX {pxor x, m8 }, m1, m7, m3, m5 + REPX {psubd x, m8 }, m1, m7 + REPX {paddd x, m11}, m2, m3, m4, m5 + REPX {psrad x, 12 }, m2, m3, m4, m5 +%else + mova [r3+0*16], m2 + mova [r3+1*16], m3 + mova [r3+2*16], m4 + mova [r3+3*16], m5 + mova m5, [o(pd_2048)] + + ITX_MULSUB_2D 7, 0, 2, 3, 4, 5, 401, 4076 ; t1a, t0a + ITX_MULSUB_2D 1, 6, 2, 3, 4, 5, 3920, 1189 ; t7a, t6a + mova m2, [r3+0*16] + mova m3, [r3+1*16] + mova m4, [r3+2*16] + mova [r3+0*16], m0 + mova [r3+1*16], m1 + mova [r3+2*16], m6 + mova m1, [r3+3*16] + mova [r3+3*16], m7 + ITX_MULSUB_2D 1, 2, 0, 6, 7, 5, 1931, 3612 ; t3a, t2a + ITX_MULSUB_2D 3, 4, 0, 6, 7, 5, 3166, 2598 ; t5a, t4a + mova m0, [r3+0*16] + mova m6, [r3+2*16] + psubd m7, m2, m6 ; t6 + paddd m2, m6 ; t2 + psubd m6, m0, m4 ; t4 + paddd m0, m4 ; t0 + mova [r3+0*16], m7 + mova m5, [r3+1*16] + mova m7, [r3+3*16] + psubd m4, m1, m5 ; t7 + paddd m5, m1 ; t3 + psubd m1, m7, m3 ; t5 + paddd m7, m3 ; t1 + mova m3, [o(clip_18b_min)] + REPX {pmaxsd x, m3 }, m6, m1, m4, m2, m0, m5, m7 + mova [r3+1*16], m7 + mova m7, [o(clip_18b_max)] + pmaxsd m3, [r3+0*16] + REPX {pminsd x, m7 }, 
m6, m1, m3, m4, m2, m0, m5 + pminsd m7, [r3+1*16] + mova [r3+0*16], m0 + mova [r3+1*16], m2 + mova [r3+2*16], m5 + mova [r3+3*16], m7 + mova m0, [o(pd_2048)] + ITX_MULSUB_2D 6, 1, 2, 5, 7, 0, 1567, 3784 ; t5a, t4a + ITX_MULSUB_2D 4, 3, 2, 5, 7, 0, 3784, 7 ; t6a, t7a + mova m5, [r3+2*16] + mova m7, [r3+3*16] + psubd m2, m6, m3 ; t7 + paddd m6, m3 ; out6 + mova [r3+3*16], m6 + mova m0, [r3+0*16] + mova m6, [r3+1*16] + psubd m3, m7, m5 ; t3 + paddd m7, m5 ; -out7 + psubd m5, m0, m6 ; t2 + paddd m0, m6 ; out0 + psubd m6, m1, m4 ; t6 + paddd m1, m4 ; -out1 + mova m4, [o(clip_18b_min)] + REPX {pmaxsd x, m4 }, m5, m3, m6, m2 + mova m4, [o(clip_18b_max)] + REPX {pminsd x, m4 }, m5, m3, m6, m2 + mova m4, [o(pd_2896)] + REPX {pmulld x, m4 }, m5, m3, m6, m2 + psubd m4, m5, m3 ; (t2 - t3) * 2896 + paddd m3, m5 ; (t2 + t3) * 2896 + psubd m5, m6, m2 ; (t6 - t7) * 2896 + paddd m2, m6 ; (t6 + t7) * 2896 + ret +.round: + mova [r3+2*16], m0 + + pcmpeqd m0, m0 + mova m6, [o(pd_2048)] + REPX {pxor x, m0 }, m1, m7, m3, m5 + REPX {psubd x, m0 }, m1, m7 + REPX {paddd x, m6 }, m2, m3, m4, m5 + REPX {psrad x, 12 }, m2, m3, m4, m5 + + mova m6, [r3+3*16] + mova m0, [r3+2*16] +%endif + ret + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main + jmp m(idct_8x4_internal_16bpc).end + +INV_TXFM_8X4_FN flipadst, dct +INV_TXFM_8X4_FN flipadst, adst +INV_TXFM_8X4_FN flipadst, flipadst +INV_TXFM_8X4_FN flipadst, identity + +cglobal iflipadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + lea r5, [o(.main)] + jmp m(idct_8x4_internal_16bpc).pass1_entry +.main: + call m(iadst_8x4_internal_16bpc).main_pass1 + call m(iadst_8x4_internal_16bpc).round + packssdw m7, m6 + packssdw m5, m4 + packssdw m3, m2 + packssdw m1, m0 + mova m0, m7 + mova m2, m5 + mova m4, m3 + mova m6, m1 + ret +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main + lea r3, [strideq*3] + add dstq, r3 + neg strideq + jmp m(idct_8x4_internal_16bpc).end + +INV_TXFM_8X4_FN identity, dct +INV_TXFM_8X4_FN identity, adst +INV_TXFM_8X4_FN identity, flipadst +INV_TXFM_8X4_FN identity, identity + +cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + lea r5, [o(.main)] + jmp m(idct_8x4_internal_16bpc).pass1_entry +.main: + REPX {paddd x, x}, m0, m1, m2, m3, m4, m5, m6, m7 + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + packssdw m6, m7 + ret +.pass2: + mova m7, [o(pw_1697x8)] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + jmp m(idct_8x4_internal_16bpc).end + +%macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset +%if ARCH_X86_64 + INV_TXFM_FN %1, %2, %3, 8x8, 15, 0-3*16 +%else + INV_TXFM_FN %1, %2, %3, 8x8, 8, 0-5*16 +%endif +%ifidn %1_%2, dct_dct + imul r5d, [cq], 181 + mov [cq], eobd ; 0 + mov r3d, 2 +.end: + add r5d, 384 + sar r5d, 9 +.end2: + imul r5d, 2896 + add r5d, 34816 + movd m0, r5d + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 + mova m6, [o(pixel_10bpc_max)] + pxor m5, m5 + lea r2, [strideq*3] +.loop: + mova m1, [dstq+strideq*0] + mova m2, [dstq+strideq*1] + mova m3, [dstq+strideq*2] + mova m4, [dstq+r2] + REPX {paddw x, m0}, m1, m2, m3, m4 + REPX {pmaxsw x, m5}, m1, m2, m3, m4 + REPX {pminsw x, m6}, m1, m2, m3, m4 + mova [dstq+strideq*0], m1 + mova [dstq+strideq*1], m2 + mova [dstq+strideq*2], m3 + mova [dstq+r2 ], m4 + lea dstq, [dstq+strideq*4] + dec r3d + jg .loop + RET +%endif +%endmacro + 
+INV_TXFM_8X8_FN dct, dct +INV_TXFM_8X8_FN dct, identity, 6 +INV_TXFM_8X8_FN dct, adst +INV_TXFM_8X8_FN dct, flipadst + +cglobal idct_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if ARCH_X86_32 + DECLARE_REG_TMP 1 + mov [rsp+4*16+1*gprsize], r1 +%else + DECLARE_REG_TMP 6 +%endif + lea t0, [o(.pass1_main)] + +.pass1_full: +%if ARCH_X86_64 + mova m11, [o(pd_2048)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] + mova m14, [o(pd_2896)] +%endif +%undef cmp +%if ARCH_X86_64 + xor r5d, r5d + cmp eobd, 10 + setge r5b +%else + mov r5d, 1 + cmp eobd, 10 + sbb r5d, 0 +%endif + shl r5d, 4 +%if ARCH_X86_32 + lea r3, [rsp+gprsize] +%endif +.loop_pass1: + mova m0, [cq+0*32+r5] + mova m1, [cq+1*32+r5] + mova m2, [cq+2*32+r5] + mova m3, [cq+3*32+r5] + mova m4, [cq+4*32+r5] + mova m5, [cq+5*32+r5] + mova m6, [cq+6*32+r5] + mova m7, [cq+7*32+r5] + call t0 + + test r5d, r5d + jz .end_pass1 + + mova [cq+0*32+16], m0 + mova [cq+1*32+16], m1 + mova [cq+2*32+16], m2 + mova [cq+3*32+16], m3 + + sub r5d, 16 + jmp .loop_pass1 +.end_pass1: + mova m4, [cq+0*32+16] + mova m5, [cq+1*32+16] + mova m6, [cq+2*32+16] + mova m7, [cq+3*32+16] +%if ARCH_X86_32 + mov r1, [rsp+4*16+1*gprsize] +%endif + jmp tx2q +.pass1_main: + call m(idct_8x4_internal_16bpc).main_pass1 + pcmpeqd m1, m1 + REPX {psubd x, m1}, m0, m6, m5, m3 + call m(idct_8x4_internal_16bpc).round + REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7 +.pack_and_transpose: + packssdw m2, m3 + packssdw m6, m7 + packssdw m0, m1 + packssdw m4, m5 + jmp m(idct_8x4_internal_16bpc).transpose4x8packed + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(idct_8x8_internal_8bpc, _ssse3).main + lea r3, [strideq*3] +%if ARCH_X86_64 + mova m10, [o(pixel_10bpc_max)] + pxor m9, m9 +%endif + call .round3_and_write_8x8 +.zero: +%if ARCH_X86_64 +%define mzero m9 +%else +%define mzero m7 + pxor m7, m7 +%endif + REPX {mova [cq+16*x], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +%undef mzero + RET + + ; round (rounded right-shift by 5) before writing + ; data in m0-7 + ; on x86-64, pw_2048 is in m8 + ; .round1 is for m0-7 + ; .round2 is for m0-6 & [rsp+gprsize*2] + ; .round3 is same, but without using m8 on x86-64 (.round2/3 are identical on x86-32) + ; .round4 is x86-32-only, it is similar to .round2 but with constant already in m7 +%if ARCH_X86_32 +.round1_and_write_8x8: + mova [rsp+gprsize*2], m7 +.round2_and_write_8x8: +%endif +.round3_and_write_8x8: + mova m7, [o(pw_2048)] +%if ARCH_X86_32 +.round4_and_write_8x8: +%endif + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [rsp+gprsize*2] +%if ARCH_X86_64 + jmp .write_8x8 +.round2_and_write_8x8: + mova m7, [rsp+gprsize*2] +.round1_and_write_8x8: + REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 +%endif + + ; m0-7 have to-be-written data [pre-rounded] + ; on x86-64, m9-10 contain a zero/pixel_max + ; on x86-32, these are runtime-generated, and [rsp+gprsize*2] is scratch + ; r0,1,3 contain dstq/strideq/stride3q + ; r5 is a scratch register +.write_8x8: + lea r5, [dstq+strideq*4] + paddw m0, [dstq+strideq*0] + paddw m1, [dstq+strideq*1] + paddw m2, [dstq+strideq*2] + paddw m3, [dstq+r3] + paddw m4, [r5 +strideq*0] + paddw m5, [r5 +strideq*1] + paddw m6, [r5 +strideq*2] + paddw m7, [r5 +r3] +%if ARCH_X86_64 + REPX {pmaxsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 +%else + mova [rsp+gprsize*2], m7 + pxor m7, m7 + REPX {pmaxsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmaxsw m7, [rsp+gprsize*2] + mova 
[rsp+gprsize*2], m7 + mova m7, [o(pixel_10bpc_max)] + REPX {pminsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pminsw m7, [rsp+gprsize*2] +%endif + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r3 ], m3 + mova [r5 +strideq*0], m4 + mova [r5 +strideq*1], m5 + mova [r5 +strideq*2], m6 + mova [r5 +r3 ], m7 + ret + +INV_TXFM_8X8_FN adst, dct +INV_TXFM_8X8_FN adst, adst +INV_TXFM_8X8_FN adst, flipadst +INV_TXFM_8X8_FN adst, identity, 6 + +cglobal iadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if ARCH_X86_32 + mov [rsp+4*16+1*gprsize], r1 +%endif + lea t0, [o(.pass1_main)] + jmp m(idct_8x8_internal_16bpc).pass1_full +.pass1_main: + call m(iadst_8x4_internal_16bpc).main_pass1 + call .round + jmp m(idct_8x8_internal_16bpc).pack_and_transpose +.round: +%if ARCH_X86_64 + pcmpeqd m8, m8 ; -1 + REPX {psubd x, m8 }, m0, m6 + REPX {pxor x, m8 }, m1, m7, m3, m5 + REPX {psrad x, 1 }, m0, m1, m6, m7 + REPX {psubd x, m8 }, m1, m7 + mova m8, [o(pd_6144)] + REPX {paddd x, m8 }, m2, m3, m4, m5 + REPX {psrad x, 13 }, m2, m3, m4, m5 +%else + mova [r3+2*16], m0 + + pcmpeqd m0, m0 ; -1 + mova m6, [o(pd_6144)] + REPX {pxor x, m0 }, m1, m7, m3, m5 + REPX {psrad x, 1 }, m1, m7 + REPX {psubd x, m0 }, m1, m7 + REPX {paddd x, m6 }, m2, m3, m4, m5 + REPX {psrad x, 13 }, m2, m3, m4, m5 + + mova m0, [r3+2*16] + psrld m6, 12 ; +1 + paddd m0, m6 + paddd m6, [r3+3*16] + REPX {psrad x, 1 }, m0, m6 +%endif + ret + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main + call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end + lea r3, [strideq*3] +%if ARCH_X86_64 + mova m10, [o(pixel_10bpc_max)] + pxor m9, m9 +%endif + call .round3_and_write_8x8 + jmp m(idct_8x8_internal_16bpc).zero + + ; round (rounded right-shift by 5) before writing; odd registers are negated + ; data in m0-7 + ; on x86-64, pw_2048 is in m8 and pw_m2048 is in m11 + ; .round1 is for m0-7 + ; .round2 is for m0-6 & [rsp+gprsize*2] + ; .round3 is same, but without using m8 on x86-64 (.round2/3 are identical on x86-32) +%if ARCH_X86_64 +.round2_and_write_8x8: + mova m7, [rsp+gprsize*2] +.round1_and_write_8x8: + REPX {pmulhrsw x, m8 }, m0, m2, m4, m6 + REPX {pmulhrsw x, m11}, m1, m3, m5, m7 + jmp m(idct_8x8_internal_16bpc).write_8x8 +%else +.round1_and_write_8x8: + mova [rsp+gprsize*2], m7 +.round2_and_write_8x8: +%endif +.round3_and_write_8x8: + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m2, m4, m6 + mova m7, [o(pw_m2048)] + REPX {pmulhrsw x, m7}, m1, m3, m5 + pmulhrsw m7, [rsp+gprsize*2] + jmp m(idct_8x8_internal_16bpc).write_8x8 + +INV_TXFM_8X8_FN flipadst, dct +INV_TXFM_8X8_FN flipadst, adst +INV_TXFM_8X8_FN flipadst, flipadst +INV_TXFM_8X8_FN flipadst, identity, 6 + +cglobal iflipadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if ARCH_X86_32 + mov [rsp+4*16+1*gprsize], r1 +%endif + lea t0, [o(.pass1_main)] + jmp m(idct_8x8_internal_16bpc).pass1_full +.pass1_main: + call m(iadst_8x4_internal_16bpc).main_pass1 + call m(iadst_8x8_internal_16bpc).round + ; invert registers + packssdw m7, m6 + packssdw m5, m4 + packssdw m3, m2 + packssdw m1, m0 + mova m0, m7 + mova m2, m5 + mova m4, m3 + mova m6, m1 + jmp m(idct_8x4_internal_16bpc).transpose4x8packed + +.pass2: + lea dstq, [dstq+strideq*8] + sub dstq, strideq + neg strideq + jmp m(iadst_8x8_internal_16bpc).pass2 + +INV_TXFM_8X8_FN identity, dct +INV_TXFM_8X8_FN identity, adst +INV_TXFM_8X8_FN identity, flipadst +INV_TXFM_8X8_FN identity, identity + +cglobal 
iidentity_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + mova m0, [cq+0*32] + mova m1, [cq+1*32] + mova m2, [cq+2*32] + mova m3, [cq+3*32] + mova m4, [cq+4*32] + mova m5, [cq+5*32] + mova m6, [cq+6*32] + mova m7, [cq+7*32] + packssdw m0, [cq+0*32+16] + packssdw m1, [cq+1*32+16] + packssdw m2, [cq+2*32+16] + packssdw m3, [cq+3*32+16] + packssdw m4, [cq+4*32+16] + packssdw m5, [cq+5*32+16] + packssdw m6, [cq+6*32+16] + packssdw m7, [cq+7*32+16] + mova [rsp+gprsize+16*1], m6 + jmp m_suffix(idct_8x8_internal_8bpc, _ssse3).pass1_end3 + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + lea r3, [strideq*3] +%if ARCH_X86_64 + mova m10, [o(pixel_10bpc_max)] + pxor m9, m9 + mova m8, [o(pw_4096)] + call m(idct_8x8_internal_16bpc).round1_and_write_8x8 +%else + mova [rsp+gprsize], m7 + mova m7, [o(pw_4096)] + call m(idct_8x8_internal_16bpc).round4_and_write_8x8 +%endif + jmp m(idct_8x8_internal_16bpc).zero + +%macro INV_TXFM_8X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix +%if ARCH_X86_64 + INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 15, 0-16*16 +%else + INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 8, 0-17*16 +%endif +%ifidn %1_%2, dct_dct + imul r5d, [cq], 181 + mov [cq], eobd ; 0 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 + mov r3d, 4 +%if stack_size_padded > 0 + ; adjust to caller's stack allocation + add rsp, (12+ARCH_X86_64)*16 +%endif + jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end +%endif +%endmacro + +INV_TXFM_8X16_FN dct, dct +INV_TXFM_8X16_FN dct, identity, v +INV_TXFM_8X16_FN dct, adst +INV_TXFM_8X16_FN dct, flipadst + +%if ARCH_X86_64 +DECLARE_REG_TMP 7 +%endif + +cglobal idct_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if WIN64 + PUSH r7 +%elif ARCH_X86_32 + mov [rsp+16*16+gprsize*1], r1 + mov [rsp+16*16+gprsize*2], r6 +%endif + lea t0, [o(m(idct_8x8_internal_16bpc).pass1_main)] +.pass1_full: +%if ARCH_X86_64 + mova m11, [o(pd_2048)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] + mova m14, [o(pd_2896)] +%endif +%undef cmp + mov r6d, 4 +.zero_loop: + dec r6d + cmp eobb, byte [r5+r6] + jl .zero_loop + mov r5d, r6d + shl r5d, 4 +%if ARCH_X86_32 + ; restore pic-ptr + mov r6, [rsp+16*16+2*gprsize] + ; setup stack pointer + lea r3, [rsp+gprsize] +%endif +.loop_pass1: + mova m0, [cq+0*64+r5] + mova m1, [cq+1*64+r5] + mova m2, [cq+2*64+r5] + mova m3, [cq+3*64+r5] + mova m4, [cq+4*64+r5] + mova m5, [cq+5*64+r5] + mova m6, [cq+6*64+r5] + mova m7, [cq+7*64+r5] + call m(idct_8x4_internal_16bpc).rect2_mul + call t0 + + mova [cq+0*64+r5], m0 + mova [cq+1*64+r5], m1 + mova [cq+2*64+r5], m2 + mova [cq+3*64+r5], m3 + sub r5d, 16 + jge .loop_pass1 +%if WIN64 + POP r7 +%elif ARCH_X86_32 + mov r1, [rsp+16*16+1*gprsize] +%endif + jmp tx2q + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + + ; input is in cqN*16, where N=0/4/8/12/1/5/9/13/2/6/10/14/3/7/11/15 + ; some are still pre-loaded from the final loop iteration in pass=1 + + mova m1, m2 + mova m2, [cq+ 1*16] + mova m3, [cq+ 9*16] + mova m4, [cq+ 2*16] + mova m5, [cq+10*16] + mova m6, [cq+ 3*16] + mova m7, [cq+11*16] + call m_suffix(idct_8x8_internal_8bpc, _ssse3).main + mova [rsp+gprsize+3*16], m0 + mova [rsp+gprsize+4*16], m1 + mova [rsp+gprsize+5*16], m2 + mova [rsp+gprsize+6*16], m3 + mova [rsp+gprsize+7*16], m4 + mova [rsp+gprsize+8*16], m5 + mova [rsp+gprsize+9*16], m6 + ; m7 is already stored in [rsp+gprsize+0*16] + mova m0, [cq+ 4*16] + mova m1, [cq+12*16] + mova m2, [cq+ 5*16] + mova m3, [cq+13*16] + mova m4, [cq+ 6*16] + mova m5, [cq+14*16] + mova m6, [cq+ 7*16] + mova m7, [cq+15*16] + call 
m_suffix(idct_16x8_internal_8bpc, _ssse3).main + + ; out0-7 is in rsp+gprsize+3-10*mmsize + ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize + +%if ARCH_X86_64 + mova m8, [o(pw_2048)] + mova m10, [o(pixel_10bpc_max)] + pxor m9, m9 + mov r6, dstq +%else + mov [rsp+16*16+gprsize*1], dstq +%endif + lea r3, [strideq*3] + lea dstq, [dstq+strideq*8] + call m(idct_8x8_internal_16bpc).round2_and_write_8x8 +%if ARCH_X86_64 +%define mzero m9 +%else +%define mzero m7 + pxor m7, m7 +%endif + REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 +%undef mzero + mova m0, [rsp+gprsize+ 3*16] + mova m1, [rsp+gprsize+ 4*16] + mova m2, [rsp+gprsize+ 5*16] + mova m3, [rsp+gprsize+ 6*16] + mova m4, [rsp+gprsize+ 7*16] + mova m5, [rsp+gprsize+ 8*16] + mova m6, [rsp+gprsize+ 9*16] + mova m7, [rsp+gprsize+10*16] +%if ARCH_X86_64 + mov dstq, r6 +%else + mov dstq, [rsp+16*16+gprsize*1] +%endif + call m(idct_8x8_internal_16bpc).round1_and_write_8x8 + RET + +INV_TXFM_8X16_FN adst, dct +INV_TXFM_8X16_FN adst, adst +INV_TXFM_8X16_FN adst, flipadst +INV_TXFM_8X16_FN adst, identity, v + +cglobal iadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if WIN64 + PUSH r7 +%elif ARCH_X86_32 + mov [rsp+16*16+gprsize*1], r1 + mov [rsp+16*16+gprsize*2], r6 +%endif + lea t0, [o(m(iadst_8x8_internal_16bpc).pass1_main)] + jmp m(idct_8x16_internal_16bpc).pass1_full + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + mova m4, [cq+ 9*16] + mova m5, [cq+13*16] + mova [rsp+gprsize+7*16], m0 + mova [rsp+gprsize+8*16], m1 + mova [rsp+gprsize+5*16], m4 + mova [rsp+gprsize+6*16], m5 + mova m0, m2 + mova m1, m3 + mova m2, [cq+ 1*16] + mova m3, [cq+ 5*16] + mova m4, [cq+ 2*16] + mova m5, [cq+ 6*16] + mova m6, [cq+11*16] + mova m7, [cq+15*16] + mova [rsp+gprsize+ 3*16], m4 + mova [rsp+gprsize+ 4*16], m5 + mova [rsp+gprsize+ 9*16], m6 + mova [rsp+gprsize+10*16], m7 + mova m4, [cq+10*16] + mova m5, [cq+14*16] + mova m6, [cq+ 3*16] + mova m7, [cq+ 7*16] + call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main + call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end + +%if ARCH_X86_64 + mova m11, [o(pw_m2048)] + mova m8, [o(pw_2048)] + mova m10, [o(pixel_10bpc_max)] + pxor m9, m9 + mov r6, dstq +%else + mov [rsp+16*16+gprsize*1], dstq +%endif + lea r3, [strideq*3] + lea dstq, [dstq+strideq*8] + call m(iadst_8x8_internal_16bpc).round2_and_write_8x8 +%if ARCH_X86_64 +%define mzero m9 +%else +%define mzero m7 + pxor m7, m7 +%endif + REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 +%undef mzero + mova m0, [rsp+gprsize+ 3*16] + mova m1, [rsp+gprsize+ 4*16] + mova m2, [rsp+gprsize+ 5*16] + mova m3, [rsp+gprsize+ 6*16] + mova m4, [rsp+gprsize+ 7*16] + mova m5, [rsp+gprsize+ 8*16] + mova m6, [rsp+gprsize+ 9*16] + mova m7, [rsp+gprsize+10*16] +%if ARCH_X86_64 + mov dstq, r6 +%else + mov dstq, [rsp+16*16+gprsize*1] +%endif + call m(iadst_8x8_internal_16bpc).round1_and_write_8x8 + RET + +INV_TXFM_8X16_FN flipadst, dct +INV_TXFM_8X16_FN flipadst, adst +INV_TXFM_8X16_FN flipadst, flipadst +INV_TXFM_8X16_FN flipadst, identity, v + +cglobal iflipadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if WIN64 + PUSH r7 +%elif ARCH_X86_32 + mov [rsp+16*16+gprsize*1], r1 + mov [rsp+16*16+gprsize*2], r6 +%endif + lea t0, [o(m(iflipadst_8x8_internal_16bpc).pass1_main)] + jmp m(idct_8x16_internal_16bpc).pass1_full 
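+
+; The .pass2 below implements flipadst by reusing the adst 8x16 second
+; pass: dstq is advanced by 15 rows (strideq*3, then *5) and strideq is
+; negated, so the adst output rows are written bottom-up, i.e. vertically
+; flipped.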
+ +.pass2: + lea r3, [strideq*3] + lea r3, [r3*5] + add dstq, r3 + neg strideq + jmp m(iadst_8x16_internal_16bpc).pass2 + +INV_TXFM_8X16_FN identity, dct, h +INV_TXFM_8X16_FN identity, adst, h +INV_TXFM_8X16_FN identity, flipadst, h +INV_TXFM_8X16_FN identity, identity + +cglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if WIN64 + PUSH r7 +%elif ARCH_X86_32 + mov [rsp+16*16+gprsize*1], r1 + mov [rsp+16*16+gprsize*2], r6 +%endif + lea t0, [o(m(idct_8x8_internal_16bpc).pack_and_transpose)] + jmp m(idct_8x16_internal_16bpc).pass1_full + +.pass2: +%if ARCH_X86_64 + mova m4, [o(pw_2048)] + mova m5, [o(pixel_10bpc_max)] + pxor m6, m6 + mova m7, [o(pw_1697x16)] +%endif + mov r5d, 4 + lea r3, [strideq*3] +.pass2_loop: + call .main +%if ARCH_X86_64 + call m(idct_8x4_internal_16bpc).round1_and_write_8x4 +%else + call m(idct_8x4_internal_16bpc).round2_and_write_8x4 +%endif + REPX {mova [cq+x*16], m6}, 0, 4, 8, 12, 16, 20, 24, 28 + dec r5d + jle .end + add cq, 16 + lea dstq, [dstq+strideq*4] + mova m0, [cq+ 0*16] + mova m1, [cq+ 4*16] + mova m2, [cq+ 8*16] + mova m3, [cq+12*16] + jmp .pass2_loop +.end: + RET +.main: + ; y = pmulhrsw(x, pw_1697x16); x = paddsw(x, x); x = paddsw(x, y) +%if ARCH_X86_32 + mova m7, [o(pw_1697x16)] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 +%else + pmulhrsw m8, m7, m0 + pmulhrsw m9, m7, m1 + pmulhrsw m10, m7, m2 + pmulhrsw m11, m7, m3 +%endif + REPX {paddsw x, x}, m0, m1, m2, m3 +%if ARCH_X86_64 + paddsw m0, m8 + paddsw m1, m9 + paddsw m2, m10 + paddsw m3, m11 +%else + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 +%endif + ret + +%macro INV_TXFM_16X4_FN 2 ; type1, type2 +%if ARCH_X86_64 + INV_TXFM_FN %1, %2, 0, 16x4, 16, 0-8*16 +%else + INV_TXFM_FN %1, %2, 0, 16x4, 8, 0-12*16 +%endif +%ifidn %1_%2, dct_dct + imul r5d, [cq], 181 + mov [cq], eobd ; 0 + mov r3d, 4 +.dconly: + add r5d, 384 + sar r5d, 9 +.dconly2: + imul r5d, 2896 + add r5d, 34816 + movd m0, r5d + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 + mova m3, [o(pixel_10bpc_max)] + pxor m4, m4 +.loop: + mova m1, [dstq+ 0] + mova m2, [dstq+16] + REPX {paddw x, m0}, m1, m2 + REPX {pminsw x, m3}, m1, m2 + REPX {pmaxsw x, m4}, m1, m2 + mova [dstq+ 0], m1 + mova [dstq+16], m2 + add dstq, strideq + dec r3d + jg .loop + RET +%endif +%endmacro + +INV_TXFM_16X4_FN dct, dct +INV_TXFM_16X4_FN dct, identity +INV_TXFM_16X4_FN dct, adst +INV_TXFM_16X4_FN dct, flipadst + +cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if ARCH_X86_64 + mova m11, [o(pd_2048)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] + mova m14, [o(pd_2896)] +%endif + ; setup stack pointer + lea r3, [rsp+gprsize] + + mova m0, [cq+ 1*16] + mova m1, [cq+ 3*16] + mova m2, [cq+ 5*16] + mova m3, [cq+ 7*16] + mova m4, [cq+ 9*16] + mova m5, [cq+11*16] + mova m6, [cq+13*16] + mova m7, [cq+15*16] + call .main_oddhalf + mova m0, [cq+ 0*16] + mova m1, [cq+ 2*16] + mova m2, [cq+ 4*16] + mova m3, [cq+ 6*16] + mova m4, [cq+ 8*16] + mova m5, [cq+10*16] + mova m6, [cq+12*16] + mova m7, [cq+14*16] + call m(idct_8x4_internal_16bpc).main_pass1 + call m(idct_8x4_internal_16bpc).round + ; t0-7 is in m0-7 + + call .round + +%if ARCH_X86_64 +.pack_transpose: + ; transpose in two parts + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + packssdw m6, m7 + packssdw m8, m9 + packssdw m10, m11 + packssdw m12, m13 + packssdw m14, m15 +.transpose: + call m(idct_8x4_internal_16bpc).transpose4x8packed + call .transpose4x8packed_hi +%else + call 
m(idct_8x4_internal_16bpc).transpose4x8packed + mova [r3+0*16], m0 + mova [r3+1*16], m1 + mova [r3+2*16], m2 + mova [r3+3*16], m3 + mova m0, [r3+ 8*16] + mova m2, [r3+ 9*16] + mova m4, [r3+10*16] + mova m6, [r3+11*16] + call m(idct_8x4_internal_16bpc).transpose4x8packed +%endif + jmp tx2q +%if ARCH_X86_64 +.transpose4x8packed_hi: + punpcklwd m9, m10, m14 + punpckhwd m10, m14 + punpckhwd m14, m8, m12 + punpcklwd m8, m12 + + punpckhwd m11, m8, m9 + punpcklwd m8, m9 + punpckhwd m12, m14, m10 + punpcklwd m14, m10 + + punpcklwd m10, m11, m12 + punpckhwd m11, m12 + punpckhwd m9, m8, m14 + punpcklwd m8, m14 + ret +%endif +.main_oddhalf_fast: ; lower half zero + pmulld m7, m0, [o(pd_4076)] + pmulld m0, [o(pd_401)] + pmulld m6, m1, [o(pd_m1189)] + pmulld m1, [o(pd_3920)] +%if ARCH_X86_32 + mova m4, [o(pd_2048)] + REPX {paddd x, m4}, m1, m6 + REPX {psrad x, 12}, m1, m6 + mova [r3+1*16], m1 +%endif + pmulld m5, m2, [o(pd_3612)] + pmulld m2, [o(pd_1931)] +%if ARCH_X86_32 + pmulld m1, m3, [o(pd_m2598)] +%else + pmulld m4, m3, [o(pd_m2598)] +%endif + pmulld m3, [o(pd_3166)] + jmp .main_oddhalf_fast2 +.main_oddhalf: +%if ARCH_X86_64 + ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 401, 4076 ; t8a, t15a + ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a + ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a + ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3166, 2598 ; t9a, t14a +.main_oddhalf_fast2: + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 + psubd m8, m0, m4 ; t9 + paddd m0, m4 ; t8 + psubd m4, m6, m2 ; t10 + paddd m2, m6 ; t11 + psubd m6, m1, m5 ; t13 + paddd m5, m1 ; t12 + psubd m1, m7, m3 ; t14 + paddd m7, m3 ; t15 + REPX {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7 + REPX {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7 + mova m15, [o(pd_3784)] + mova m10, [o(pd_1567)] + ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15 + ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 4 + psubd m3, m1, m4 ; t10 + paddd m1, m4 ; t9 + psubd m4, m0, m2 ; t11a + paddd m0, m2 ; t8a + psubd m2, m8, m6 ; t13 + paddd m6, m8 ; t14 + psubd m8, m7, m5 ; t12a + paddd m7, m5 ; t15a + REPX {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7 + REPX {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7 + REPX {pmulld x, m14}, m2, m8, m3, m4 + paddd m2, m11 + paddd m8, m11 + paddd m5, m2, m3 ; t13a + psubd m2, m3 ; t10a + psubd m3, m8, m4 ; t11 + paddd m4, m8 ; t12 + REPX {psrad x, 12}, m5, m2, m3, m4 + mova [r3+0*16], m0 + mova [r3+1*16], m1 + mova [r3+2*16], m2 + mova [r3+3*16], m3 + mova [r3+4*16], m4 + mova [r3+5*16], m5 + mova [r3+6*16], m6 + mova [r3+7*16], m7 +%else + mova [r3+0*16], m2 + mova [r3+1*16], m3 + mova [r3+2*16], m4 + mova [r3+3*16], m5 + mova m4, [o(pd_2048)] + + ITX_MULSUB_2D 0, 7, 2, 3, 5, _, 401, 4076 ; t8a, t15a + ITX_MULSUB_2D 6, 1, 2, 3, 5, 4, 3920, 1189 ; t11a, t12a + + mova m2, [r3+0*16] + mova m3, [r3+1*16] + mova [r3+0*16], m0 + mova [r3+1*16], m1 + mova m1, [r3+2*16] + mova m5, [r3+3*16] + mova [r3+2*16], m6 + mova [r3+3*16], m7 + + ITX_MULSUB_2D 2, 5, 0, 6, 7, _, 1931, 3612 ; t10a, t13a + ITX_MULSUB_2D 1, 3, 0, 6, 7, _, 3166, 2598 ; t9a, t14a + + mova m0, [r3+0*16] + mova m6, [r3+2*16] + mova m7, [r3+3*16] +.main_oddhalf_fast2: + REPX {paddd x, m4}, m0, m7, m2, m5, m1, m3 + REPX {psrad x, 12}, m0, m7, m2, m5, m1, m3 + psubd m4, m0, m1 ; t9 + paddd m0, m1 ; t8 + mova m1, [r3+1*16] + mova [r3+0*16], m4 + psubd m4, m6, m2 ; t10 + paddd m2, m6 ; t11 + psubd m6, m1, m5 ; t13 + paddd m5, m1 ; t12 + psubd m1, m7, m3 ; t14 + paddd m7, m3 ; t15 + mova m3, [o(clip_18b_min)] + REPX 
{pmaxsd x, m3}, m1, m4, m6, m0, m2, m5, m7 + pmaxsd m3, [r3+0*16] + mova [r3+0*16], m3 + mova m3, [o(clip_18b_max)] + REPX {pminsd x, m3}, m1, m4, m6, m0, m2, m5, m7 + pminsd m3, [r3+0*16] + mova [r3+0*16], m0 + mova [r3+1*16], m2 + mova [r3+2*16], m5 + mova [r3+3*16], m7 + mova m7, [o(pd_2048)] + ITX_MULSUB_2D 1, 3, 0, 2, 5, 7, 1567, 3784 + ITX_MULSUB_2D 6, 4, 0, 2, _, 7, 5, 3784, 4 + mova m0, [r3+0*16] + mova m2, [r3+1*16] + psubd m5, m1, m4 ; t10 + mova [r3+1*16], m5 + paddd m1, m4 ; t9 + psubd m4, m0, m2 ; t11a + paddd m0, m2 ; t8a + mova m5, [r3+2*16] + mova m7, [r3+3*16] + psubd m2, m3, m6 ; t13 + paddd m6, m3 ; t14 + paddd m3, m7, m5 ; t15a + psubd m7, m5 ; t12a + mova [r3+0*16], m3 + mova m3, [r3+1*16] + mova m5, [o(clip_18b_min)] + REPX {pmaxsd x, m5}, m2, m7, m3, m4, m0, m1, m6 + pmaxsd m5, [r3+0*16] + mova [r3+0*16], m5 + mova m5, [o(clip_18b_max)] + REPX {pminsd x, m5}, m2, m7, m3, m4, m0, m1, m6 + pminsd m5, [r3+0*16] + mova [r3+0*16], m5 + mova m5, [o(pd_2896)] + REPX {pmulld x, m5}, m2, m7, m3, m4 + mova m5, [o(pd_2048)] + REPX {paddd x, m5}, m2, m7 + paddd m5, m2, m3 ; t13a + psubd m2, m3 ; t10a + psubd m3, m7, m4 ; t11 + paddd m4, m7 ; t12 + REPX {psrad x, 12}, m5, m2, m3, m4 + mova m7, [r3+0*16] + mova [r3+11*16], m0 + mova [r3+10*16], m1 + mova [r3+9*16], m2 + mova [r3+8*16], m3 + mova [r3+7*16], m4 + mova [r3+6*16], m5 + mova [r3+5*16], m6 + mova [r3+4*16], m7 +%endif + ret +.round: +%if ARCH_X86_64 + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + pcmpeqd m8, m8 + REPX {psubd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + mova m8, [r3+1*16] + mova m9, [r3+2*16] + mova m10, [r3+3*16] + mova m11, [r3+4*16] + mova m12, [r3+5*16] + mova m13, [r3+6*16] + mova m14, [r3+7*16] + psubd m15, m0, m14 ; out15 + paddd m0, m14 ; out0 + psubd m14, m1, m13 ; out14 + paddd m1, m13 ; out1 + psubd m13, m2, m12 ; out13 + paddd m2, m12 ; out2 + psubd m12, m3, m11 ; out12 + paddd m3, m11 ; out3 + psubd m11, m4, m10 ; out11 + paddd m4, m10 ; out4 + psubd m10, m5, m9 ; out10 + paddd m5, m9 ; out5 + psubd m9, m6, m8 ; out9 + paddd m6, m8 ; out6 + psubd m8, m7, [r3+0*16] ; out8 + paddd m7, [r3+0*16] ; out7 + REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + ; and out0-15 is now in m0-15 +%else + mova [r3+ 0*16], m0 + mova m0, [o(clip_18b_min)] + REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7 + pmaxsd m0, [r3+ 0*16] + mova [r3+ 0*16], m7 + mova m7, [o(clip_18b_max)] + REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6 + pminsd m7, [r3+ 0*16] + mova [r3+ 0*16], m0 + pcmpeqd m0, m0 + REPX {psubd x, m0}, m1, m2, m3, m4, m5, m6, m7 + mova [r3+ 1*16], m1 + mova [r3+ 2*16], m2 + mova m1, [r3+ 0*16] + psubd m1, m0 + mova [r3+ 0*16], m1 + mova m1, [r3+11*16] + mova m2, [r3+10*16] + psubd m0, m7, m1 + paddd m7, m1 + psubd m1, m6, m2 + paddd m6, m2 + REPX {psrad x, 1}, m0, m1, m6, m7 + packssdw m0, m1 ; out8-9 + packssdw m6, m7 ; out6-7 + mova [r3+11*16], m6 + mova m1, [r3+9*16] + mova m7, [r3+8*16] + psubd m2, m5, m1 + paddd m5, m1 + psubd m1, m4, m7 + paddd m4, m7 + REPX {psrad x, 1}, m2, m1, m4, m5 + packssdw m2, m1 ; out10-11 + packssdw m4, m5 ; out4-5 + mova m1, [r3+2*16] + mova [r3+10*16], m4 + mova m6, [r3+7*16] + mova m7, [r3+6*16] + psubd m4, m3, m6 + paddd m3, m6 + psubd m6, m1, m7 + paddd m1, m7 + REPX {psrad x, 1}, m4, m6, m1, m3 + packssdw m4, m6 ; out12-13 + packssdw m1, m3 ; out2-3 + mova m3, [r3+1*16] + mova [r3+9*16], m1 + mova m1, [r3+0*16] + mova m5, [r3+5*16] + mova m7, [r3+4*16] + psubd 
+.pass2:
+    lea r4, [o(m_suffix(idct_8x4_internal_8bpc, _ssse3).main)]
+.pass2_loop:
+    lea r3, [strideq*3]
+%if ARCH_X86_32
+    lea r5, [o(itx8_start)]
+%endif
+    call r4
+    call m(idct_8x4_internal_16bpc).round2_and_write_8x4
+    REPX {mova [cq+x*16], m6}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+%if ARCH_X86_64
+    mova m0, m8
+    mova m1, m9
+    mova m2, m10
+    mova m3, m11
+%else
+    mova m0, [rsp+gprsize+0*16]
+    mova m1, [rsp+gprsize+1*16]
+    mova m2, [rsp+gprsize+2*16]
+    mova m3, [rsp+gprsize+3*16]
+%endif
+    add dstq, 16
+%if ARCH_X86_32
+    lea r5, [o(itx8_start)]
+%endif
+    call r4
+    call m(idct_8x4_internal_16bpc).round2_and_write_8x4
+    RET
+
+INV_TXFM_16X4_FN adst, dct
+INV_TXFM_16X4_FN adst, adst
+INV_TXFM_16X4_FN adst, flipadst
+INV_TXFM_16X4_FN adst, identity
+
+cglobal iadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+    ; setup stack pointer
+    lea r3, [rsp+gprsize]
+    call .main
+%if ARCH_X86_64
+    jmp m(idct_16x4_internal_16bpc).pack_transpose
+%else
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova [rsp+gprsize+0*16], m0
+    mova [rsp+gprsize+1*16], m1
+    mova [rsp+gprsize+2*16], m2
+    mova [rsp+gprsize+3*16], m3
+    mova m0, [rsp+gprsize+ 8*16]
+    mova m2, [rsp+gprsize+ 9*16]
+    mova m4, [rsp+gprsize+10*16]
+    mova m6, [rsp+gprsize+11*16]
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    jmp tx2q
+%endif
+
+.main:
+%if ARCH_X86_64
+    mova m11, [o(pd_2048)]
+    mova m12, [o(clip_18b_min)]
+    mova m13, [o(clip_18b_max)]
+    mova m14, [o(pd_2896)]
+%endif
+    mova m0, [cq+ 2*16]
+    mova m1, [cq+13*16]
+    mova m2, [cq+ 6*16]
+    mova m3, [cq+ 9*16]
+    mova m4, [cq+10*16]
+    mova m5, [cq+ 5*16]
+    mova m6, [cq+14*16]
+    mova m7, [cq+ 1*16]
+    call .main_part1
+    mova m0, [cq+ 0*16]
+    mova m1, [cq+15*16]
+    mova m2, [cq+ 4*16]
+    mova m3, [cq+11*16]
+    mova m4, [cq+ 8*16]
+    mova m5, [cq+ 7*16]
+    mova m6, [cq+12*16]
+    mova m7, [cq+ 3*16]
+    call .main_part2
+.round:
+%if ARCH_X86_64
+    mova m15, [o(pd_6144)]
+    psrld m14, 11 ; pd_1
+    pcmpeqd m8, m8 ; -1
+    psubd m13, m15, m14 ; pd_6143
+    REPX {paddd x, m14}, m0, m2
+    REPX {paddd x, m15}, m4, m6
+    REPX {pxor x, m8 }, m1, m3, m5, m7
+    REPX {psrad x, 1 }, m1, m3
+    REPX {paddd x, m15}, m5, m7
+    REPX {psubd x, m8 }, m1, m3
+    paddd m8, m15, m9
+    psubd m9, m13, m10
+    paddd m10, m15, m11
+    psubd m11, m13, m12
+    paddd m12, m14, [r3+3*16]
+    psubd m13, m14, [r3+2*16]
+    psubd m15, m14, [r3+0*16]
+    paddd m14, [r3+1*16]
+    REPX {psrad x, 1 }, m0, m2, m12, m13, m14, m15
+    REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11
+%else
+    mova [r3+8*16], m1
+    mova [r3+9*16], m3
+    mova m3, [o(pd_6144)]
+    pcmpeqd m1, m1
+    REPX {pxor x, m1}, m5, m7
+    REPX {paddd x, m3}, m4, m5, m6, m7
+    REPX {psrad x, 13}, m4, m5, m6, m7
+    packssdw m4, m5
+    packssdw m6, m7
+    mova [r3+10*16], m4
+    mova [r3+11*16], m6
+    mova m4, [r3+4*16]
+    mova m5, [r3+5*16]
+    mova m6, [r3+6*16]
+    mova m7, [r3+7*16]
+    REPX {pxor x, m1}, m5, m7
+    REPX {psubd x, m1}, m4, m6
+    REPX {psrad x, 1 }, m4, m5, m6, m7
+    REPX {psubd x, m1}, m5, m7
+    packssdw m4, m5
+    packssdw m6, m7
+    mova m5, [r3+8*16]
+    mova m7, [r3+9*16]
+    mova [r3+8*16], m4
+    mova [r3+9*16], m6
+    REPX {pxor x, m1}, m5, m7
+    REPX {paddd x, m3}, m0, m5, m2, m7
+    REPX {psrad x, 13}, m0, m5, m2, m7
+    packssdw m0, m5
+    packssdw m2, m7
+    mova m4, [r3+0*16]
+    mova m5, [r3+1*16]
+    mova m6, [r3+2*16]
+    mova m7, [r3+3*16]
+    REPX {psubd x, m1}, m4, m6
+    REPX {pxor x, m1}, m5, m7
+    REPX {psrad x, 1 }, m4, m5, m6, m7
+    REPX {psubd x, m1}, m5, m7
+    packssdw m4, m5
+    packssdw m6, m7
+%endif
+    ret
+
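+; .main_part2 (and .main_part1 further below) form the two halves of the
+; 16-point ADST butterfly network; the pd_* multipliers are 12-bit
+; fixed-point sin/cos pairs (e.g. 201/4091), rotated via ITX_MULSUB_2D
+; with pd_2048 and a >>12 shift for rounding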
+.main_part2:
+%if ARCH_X86_64
+    ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201, 4091
+    ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751, 3703
+    ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3035, 2751
+    ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 3857, 1380
+    psubd m8, m0, m4 ; t8a
+    paddd m0, m4 ; t0a
+    psubd m4, m1, m5 ; t9a
+    paddd m1, m5 ; t1a
+    psubd m5, m2, m6 ; t12a
+    paddd m2, m6 ; t4a
+    psubd m6, m3, m7 ; t13a
+    paddd m7, m3 ; t5a
+    REPX {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7
+    REPX {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
+    mova m15, [o(pd_4017)]
+    mova m10, [o(pd_799)]
+    ITX_MULSUB_2D 8, 4, 3, 9, _, 11, 10, 15
+    ITX_MULSUB_2D 6, 5, 3, 9, _, 11, 15, 10
+    psubd m3, m0, m2 ; t4
+    paddd m0, m2 ; t0
+    psubd m2, m1, m7 ; t5
+    paddd m1, m7 ; t1
+    psubd m7, m4, m6 ; t12a
+    paddd m4, m6 ; t8a
+    psubd m6, m8, m5 ; t13a
+    paddd m5, m8 ; t9a
+    REPX {pmaxsd x, m12}, m3, m2, m7, m6, m0, m1, m4, m5
+    REPX {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
+    mova m15, [o(pd_3784)]
+    mova m10, [o(pd_1567)]
+    ITX_MULSUB_2D 3, 2, 8, 9, _, 11, 10, 15
+    ITX_MULSUB_2D 7, 6, 8, 9, _, 11, 10, 15
+    mova m10, [r3+0*16] ; t2
+    mova m8, [r3+1*16] ; t3
+    psubd m9, m0, m10 ; t2a
+    paddd m0, m10 ; out0
+    psubd m10, m1, m8 ; t3a
+    paddd m1, m8 ; -out15
+    mova [r3+0*16], m1
+    mova m15, [r3+3*16] ; t7a
+    mova m1, [r3+2*16] ; t6a
+    psubd m8, m3, m15 ; t7
+    paddd m15, m3 ; out12
+    paddd m3, m2, m1 ; -out3
+    psubd m2, m1 ; t6
+    mova [r3+3*16], m15
+    mova [r3+1*16], m2
+    mova m1, [r3+7*16] ; t15
+    mova m2, [r3+6*16] ; t14
+    paddd m15, m7, m1 ; -out13
+    psubd m7, m1 ; t15a
+    psubd m11, m6, m2 ; t14a
+    paddd m2, m6 ; out2
+    mova [r3+2*16], m15
+    mova m1, [r3+4*16] ; t10a
+    mova m15, [r3+5*16] ; t11a
+    psubd m6, m4, m1 ; t10
+    paddd m1, m4 ; -out1
+    psubd m4, m5, m15 ; t11
+    paddd m5, m15 ; out14
+    REPX {pmaxsd x, m12}, m11, m7, m9, m10, m6, m4, m8
+    pmaxsd m12, [r3+1*16] ; t6
+    mova [r3+1*16], m5
+    REPX {pminsd x, m13}, m11, m7, m9, m10, m6, m4, m12, m8
+    REPX {pmulld x, m14}, m11, m7, m9, m10, m6, m4, m12, m8
+    paddd m5, m11, m7 ; -out5 (unshifted)
+    psubd m11, m7 ; out10 (unshifted)
+    paddd m7, m9, m10 ; -out7 (unshifted)
+    psubd m9, m10 ; out8 (unshifted)
+    psubd m10, m6, m4 ; -out9 (unshifted)
+    paddd m6, m4 ; out6 (unshifted)
+    paddd m4, m12, m8 ; out4 (unshifted)
+    psubd m12, m8 ; -out11 (unshifted)
+%else
+    mova [r3+8*16], m0
+    mova [r3+9*16], m1
+    mova [r3+10*16], m2
+    mova [r3+11*16], m3
+    mova m3, [o(pd_2048)]
+    ITX_MULSUB_2D 5, 4, 0, 1, 2, 3, 3035, 2751
+    ITX_MULSUB_2D 7, 6, 0, 1, 2, 3, 3857, 1380
+    mova m0, [r3+8*16]
+    mova m1, [r3+9*16]
+    mova [r3+8*16], m4
+    mova m4, [r3+10*16]
+    mova [r3+9*16], m5
+    mova [r3+10*16], m6
+    mova m5, [r3+11*16]
+    mova [r3+11*16], m7
+    ITX_MULSUB_2D 1, 0, 2, 6, 7, 3, 201, 4091
+    ITX_MULSUB_2D 5, 4, 2, 6, 7, 3, 1751, 3703
+    mova m2, [r3+8*16]
+    mova m6, [r3+9*16]
+    psubd m3, m0, m2 ; t8a
+    paddd m0, m2 ; t0a
+    mova [r3+8*16], m3
+    psubd m2, m1, m6 ; t9a
+    paddd m1, m6 ; t1a
+    mova m3, [r3+10*16]
+    psubd m6, m4, m3 ; t12a
+    paddd m4, m3 ; t4a
+    mova m3, [r3+11*16]
+    psubd m7, m5, m3 ; t13a
+    paddd m5, m3 ; t5a
+    mova m3, [o(clip_18b_min)]
+    REPX {pmaxsd x, m3}, m2, m6, m7, m0, m1, m4, m5
+    pmaxsd m3, [r3+8*16]
+    mova [r3+8*16], m3
+    mova m3, [o(clip_18b_max)]
+    REPX {pminsd x, m3}, m2, m6, m7, m0, m1, m4, m5
+    pminsd m3, [r3+8*16]
+    mova [r3+8*16], m3
+    psubd m3, m0, m4 ; t4
+    paddd m0, m4 ; t0
+    psubd m4, m1, m5 ; t5
+    paddd m1, m5 ; t1
+    mova m5, [o(pd_2048)]
+    mova [r3+9*16], m1
+    mova [r3+10*16], m4
+    mova [r3+11*16], m3
+    mova m3, [r3+8*16]
+    mova [r3+8*16], m0
+    ITX_MULSUB_2D 3, 2, 0, 1, 4, 5, 799, 4017
+    ITX_MULSUB_2D 7, 6, 0, 1, 4, 5, 4017, 4
+    psubd m5, m2, m7 ; t12a
+    paddd m2, m7 ; t8a
+    psubd m7, m3, m6 ; t13a
+    paddd m6, m3 ; t9a
+    mova m0, [r3+8*16]
+    mova m1, [r3+9*16]
+    mova m4, [r3+10*16]
+    mova m3, [o(clip_18b_min)]
+    REPX {pmaxsd x, m3}, m4, m5, m7, m0, m1, m2, m6
+    pmaxsd m3, [r3+11*16]
+    mova [r3+8*16], m3
+    mova m3, [o(clip_18b_max)]
+    REPX {pminsd x, m3}, m4, m5, m7, m0, m1, m2, m6
+    pminsd m3, [r3+8*16]
+    mova [r3+8*16], m0
+    mova [r3+9*16], m1
+    mova [r3+10*16], m2
+    mova [r3+11*16], m6
+    mova m0, [o(pd_2048)]
+    ITX_MULSUB_2D 3, 4, 1, 2, 6, 0, 1567, 3784
+    ITX_MULSUB_2D 5, 7, 1, 2, 6, 0, 6, 3784
+    mova m0, [r3+7*16] ; t7a
+    mova m2, [r3+6*16] ; t6a
+    psubd m1, m3, m0 ; t7
+    paddd m0, m3 ; out12
+    paddd m3, m4, m2 ; -out3
+    psubd m4, m2 ; t6
+    mova [r3+7*16], m3
+    mova m3, [r3+3*16] ; t15
+    mova m2, [r3+2*16] ; t14
+    paddd m6, m5, m3 ; -out13
+    psubd m5, m3 ; t15a
+    psubd m3, m7, m2 ; t14a
+    paddd m2, m7 ; out2
+    mova [r3+6*16], m2
+    mova m7, [r3+0*16] ; t10a
+    mova m2, [r3+1*16] ; t11a
+    mova [r3+0*16], m0
+    mova [r3+1*16], m6
+    mova m6, [r3+11*16]
+    psubd m0, m6, m2 ; t11
+    paddd m6, m2 ; out14
+    mova [r3+2*16], m6
+    mova m2, [r3+10*16]
+    psubd m6, m2, m7 ; t10
+    paddd m2, m7 ; -out1
+    mova m7, [r3+5*16] ; t3
+    mova [r3+5*16], m2
+    mova [r3+10*16], m1
+    mova m1, [r3+9*16]
+    psubd m2, m1, m7 ; t3a
+    paddd m1, m7 ; -out15
+    mova [r3+3*16], m1
+    mova m1, [r3+4*16] ; t2
+    mova m7, [r3+8*16]
+    psubd m7, m1 ; t2a
+    paddd m1, [r3+8*16] ; out0
+    mova [r3+4*16], m1
+    mova m1, [o(clip_18b_min)]
+    REPX {pmaxsd x, m1}, m0, m2, m3, m4, m5, m6, m7
+    pmaxsd m1, [r3+10*16]
+    mova [r3+10*16], m1
+    mova m1, [o(clip_18b_max)]
+    REPX {pminsd x, m1}, m0, m2, m3, m4, m5, m6, m7
+    pminsd m1, [r3+10*16]
+    mova [r3+10*16], m1
+    mova m1, [o(pd_2896)]
+    REPX {pmulld x, m1}, m0, m2, m3, m4, m5, m6, m7
+    pmulld m1, [r3+10*16]
+    mova [r3+11*16], m3
+    psubd m3, m4, m1 ; -out11 (unshifted)
+    paddd m4, m1 ; out4 (unshifted)
+    psubd m1, m6, m0 ; -out9 (unshifted)
+    paddd m6, m0 ; out6 (unshifted)
+    psubd m0, m7, m2 ; out8 (unshifted)
+    paddd m7, m2 ; -out7 (unshifted)
+    mova m2, [r3+11*16]
+    mova [r3+11*16], m5
+    paddd m5, m2 ; -out5 (unshifted)
+    psubd m2, [r3+11*16] ; out10 (unshifted)
+    ; m0-3 contain out8-11 (unshifted), m4-7 contain out4-7 (unshifted)
+    ; r[-4,3] contain out0-3 and out12-15
+%endif
+    ret
+.main_part1:
+%if ARCH_X86_64
+    ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 995, 3973
+    ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 2440, 3290
+    ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3513, 2106
+    ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 4052, 601
+    psubd m8, m0, m4 ; t10a
+    paddd m0, m4 ; t2a
+    psubd m4, m1, m5 ; t11a
+    paddd m1, m5 ; t3a
+    psubd m5, m2, m6 ; t14a
+    paddd m2, m6 ; t6a
+    psubd m6, m3, m7 ; t15a
+    paddd m7, m3 ; t7a
+    REPX {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7
+    REPX {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
+    mova m15, [o(pd_2276)]
+    mova m10, [o(pd_3406)]
+    ITX_MULSUB_2D 8, 4, 3, 9, _, 11, 10, 15
+    ITX_MULSUB_2D 6, 5, 3, 9, _, 11, 15, 10
+    psubd m3, m0, m2 ; t6
+    paddd m0, m2 ; t2
+    psubd m2, m1, m7 ; t7
+    paddd m1, m7 ; t3
+    psubd m7, m4, m6 ; t14a
+    paddd m4, m6 ; t10a
+    psubd m6, m8, m5 ; t15a
+    paddd m5, m8 ; t11a
+    REPX {pmaxsd x, m12}, m3, m2, m7, m6, m0, m1, m4, m5
+    REPX {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
+    mova m15, [o(pd_1567)]
+    mova m10, [o(pd_3784)]
+    ITX_MULSUB_2D 2, 3, 8, 9, _, 11, 10, 15
+    ITX_MULSUB_2D 6, 7, 8, 9, _, 11, 10, 15
+    mova [r3+0*16], m0
+    mova [r3+1*16], m1
+    mova [r3+4*16], m4
+    mova [r3+5*16], m5
+    mova [r3+2*16], m2
+    mova [r3+3*16], m3
+    mova [r3+6*16], m6
+    mova [r3+7*16], m7
+%else
+    mova [r3+4*16], m0
+    mova [r3+5*16], m1
+    mova [r3+6*16], m2
+    mova [r3+7*16], m3
+    mova m3, [o(pd_2048)]
+    ITX_MULSUB_2D 5, 4, 0, 1, 2, 3, 3513, 2106
+    ITX_MULSUB_2D 7, 6, 0, 1, 2, 3, 4052, 601
+    mova [r3+0*16], m4
+    mova [r3+1*16], m5
+    mova [r3+2*16], m6
+    mova [r3+3*16], m7
+    mova m0, [r3+4*16]
+    mova m1, [r3+5*16]
+    mova m2, [r3+6*16]
+    mova m7, [r3+7*16]
+    ITX_MULSUB_2D 1, 0, 4, 5, 6, 3, 995, 3973
+    ITX_MULSUB_2D 7, 2, 4, 5, 6, 3, 2440, 3290
+    mova m4, [r3+0*16]
+    mova m5, [r3+1*16]
+    psubd m6, m0, m4 ; t10a
+    paddd m0, m4 ; t2a
+    mova [r3+4*16], m6
+    mova m6, [r3+2*16]
+    mova m3, [r3+3*16]
+    psubd m4, m1, m5 ; t11a
+    paddd m1, m5 ; t3a
+    psubd m5, m2, m6 ; t14a
+    paddd m2, m6 ; t6a
+    psubd m6, m7, m3 ; t15a
+    paddd m7, m3 ; t7a
+    mova m3, [o(clip_18b_min)]
+    REPX {pmaxsd x, m3}, m4, m5, m6, m0, m1, m2, m7
+    pmaxsd m3, [r3+4*16]
+    mova [r3+4*16], m3
+    mova m3, [o(clip_18b_max)]
+    REPX {pminsd x, m3}, m4, m5, m6, m0, m1, m2, m7
+    pminsd m3, [r3+4*16]
+    mova [r3+4*16], m3
+    psubd m3, m0, m2 ; t6
+    paddd m0, m2 ; t2
+    psubd m2, m1, m7 ; t7
+    paddd m1, m7 ; t3
+    mova [r3+5*16], m1
+    mova [r3+6*16], m3
+    mova [r3+7*16], m2
+    mova m1, [r3+4*16]
+    mova [r3+4*16], m0
+    mova m3, [o(pd_2048)]
+    ITX_MULSUB_2D 1, 4, 0, 7, 2, 3, 3406, 2276
+    ITX_MULSUB_2D 6, 5, 0, 7, 2, 3, 2276, 2
+    psubd m7, m4, m6 ; t14a
+    paddd m4, m6 ; t10a
+    psubd m6, m1, m5 ; t15a
+    paddd m5, m1 ; t11a
+    mova m1, [r3+5*16]
+    mova m3, [r3+6*16]
+    mova m2, [r3+7*16]
+    mova m0, [o(clip_18b_min)]
+    REPX {pmaxsd x, m0}, m3, m2, m7, m6, m1, m4, m5
+    pmaxsd m0, [r3+4*16]
+    mova [r3+4*16], m0
+    mova m0, [o(clip_18b_max)]
+    REPX {pminsd x, m0}, m3, m2, m7, m6, m1, m4, m5
+    pminsd m0, [r3+4*16]
+    mova [r3+4*16], m0
+    mova [r3+5*16], m1
+    mova [r3+0*16], m4
+    mova [r3+1*16], m5
+    mova m0, [o(pd_2048)]
+    ITX_MULSUB_2D 2, 3, 1, 4, 5, 0, 3784, 1567
+    ITX_MULSUB_2D 6, 7, 1, 4, 5, 0, 5, 1567
+    mova [r3+6*16], m2
+    mova [r3+7*16], m3
+    mova [r3+2*16], m6
+    mova [r3+3*16], m7
+%endif
+    ret
+
+.pass2:
+    lea r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)]
+    jmp m(idct_16x4_internal_16bpc).pass2_loop
+
+INV_TXFM_16X4_FN flipadst, dct
+INV_TXFM_16X4_FN flipadst, adst
+INV_TXFM_16X4_FN flipadst, flipadst
+INV_TXFM_16X4_FN flipadst, identity
+
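+; flipadst is adst with the output rows in reverse order: pass 1 below
+; reuses iadst_16x4's .main and then reverses the packed row order
+; (pshufd q1032 swaps the two 4-sample halves on the x86-32 path)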
+cglobal iflipadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+    lea r3, [rsp+gprsize]
+    call m(iadst_16x4_internal_16bpc).main
+%if ARCH_X86_64
+    packssdw m1, m0
+    packssdw m3, m2
+    packssdw m5, m4
+    packssdw m7, m6
+    packssdw m9, m8
+    packssdw m11, m10
+    packssdw m13, m12
+    packssdw m15, m14
+    mova m0, m15
+    mova m2, m13
+    mova m4, m11
+    mova m6, m9
+    mova m8, m7
+    mova m10, m5
+    mova m12, m3
+    mova m14, m1
+    jmp m(idct_16x4_internal_16bpc).transpose
+%else
+    mova [rsp+gprsize+4*16], m0
+    mova [rsp+gprsize+5*16], m2
+    mova [rsp+gprsize+6*16], m4
+    mova [rsp+gprsize+7*16], m6
+    pshufd m6, [rsp+gprsize+ 8*16], q1032
+    pshufd m4, [rsp+gprsize+ 9*16], q1032
+    pshufd m2, [rsp+gprsize+10*16], q1032
+    pshufd m0, [rsp+gprsize+11*16], q1032
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova [rsp+gprsize+0*16], m0
+    mova [rsp+gprsize+1*16], m1
+    mova [rsp+gprsize+2*16], m2
+    mova [rsp+gprsize+3*16], m3
+    pshufd m6, [rsp+gprsize+ 4*16], q1032
+    pshufd m4, [rsp+gprsize+ 5*16], q1032
+    pshufd m2, [rsp+gprsize+ 6*16], q1032
+    pshufd m0, [rsp+gprsize+ 7*16], q1032
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    jmp tx2q
+%endif
+
+.pass2:
+    lea r3, [strideq*3]
+    lea dstq, [dstq+r3]
+    neg strideq
+    lea r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)]
+    jmp m(idct_16x4_internal_16bpc).pass2_loop
+
+INV_TXFM_16X4_FN identity, dct
+INV_TXFM_16X4_FN identity, adst
+INV_TXFM_16X4_FN identity, flipadst
+INV_TXFM_16X4_FN identity, identity
+
+cglobal iidentity_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_64
+    mova m15, [o(pd_11586)]
+    pmulld m0, m15, [cq+ 0*16]
+    pmulld m1, m15, [cq+ 1*16]
+    pmulld m2, m15, [cq+ 2*16]
+    pmulld m3, m15, [cq+ 3*16]
+    pmulld m4, m15, [cq+ 4*16]
+    pmulld m5, m15, [cq+ 5*16]
+    pmulld m6, m15, [cq+ 6*16]
+    pmulld m7, m15, [cq+ 7*16]
+    pmulld m8, m15, [cq+ 8*16]
+    pmulld m9, m15, [cq+ 9*16]
+    pmulld m10, m15, [cq+10*16]
+    pmulld m11, m15, [cq+11*16]
+    pmulld m12, m15, [cq+12*16]
+    pmulld m13, m15, [cq+13*16]
+    pmulld m14, m15, [cq+14*16]
+    pmulld m15, [cq+15*16]
+    mova [cq+ 0*16], m15
+    mova m15, [o(pd_6144)]
+    REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+                         m8, m9, m10, m11, m12, m13, m14
+    paddd m15, [cq+ 0*16]
+    REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+                         m8, m9, m10, m11, m12, m13, m14, m15
+    jmp m(idct_16x4_internal_16bpc).pack_transpose
+%else
+    add cq, 8*16
+    mov r5d, 2
+.loop_pass1:
+    mova m7, [o(pd_11586)]
+    pmulld m0, m7, [cq+0*16]
+    pmulld m1, m7, [cq+1*16]
+    pmulld m2, m7, [cq+2*16]
+    pmulld m3, m7, [cq+3*16]
+    pmulld m4, m7, [cq+4*16]
+    pmulld m5, m7, [cq+5*16]
+    pmulld m6, m7, [cq+6*16]
+    pmulld m7, [cq+7*16]
+    mova [cq+7*16], m7
+    mova m7, [o(pd_6144)]
+    REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+    paddd m7, [cq+7*16]
+    REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
+    packssdw m0, m1
+    packssdw m2, m3
+    packssdw m4, m5
+    packssdw m6, m7
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    dec r5d
+    jz .end_pass1
+    mova [rsp+gprsize+0*16], m0
+    mova [rsp+gprsize+1*16], m1
+    mova [rsp+gprsize+2*16], m2
+    mova [rsp+gprsize+3*16], m3
+    sub cq, 8*16
+    jmp .loop_pass1
+.end_pass1:
+    jmp tx2q
+%endif
+
+.pass2:
+%if ARCH_X86_64
+    mova m12, [o(pw_1697x8)]
+%endif
+    lea r4, [o(.main)]
+    jmp m(idct_16x4_internal_16bpc).pass2_loop
+.main:
+%if ARCH_X86_64
+    pmulhrsw m4, m0, m12
+    pmulhrsw m5, m1, m12
+    pmulhrsw m6, m2, m12
+    pmulhrsw m7, m3, m12
+%else
+    mova m7, [o(pw_1697x8)]
+    pmulhrsw m4, m0, m7
+    pmulhrsw m5, m1, m7
+    pmulhrsw m6, m2, m7
+    pmulhrsw m7, m3
+%endif
+    paddsw m0, m4
+    paddsw m1, m5
+    paddsw m2, m6
+    paddsw m3, m7
+    ret
+
+%macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset
+%if ARCH_X86_64
+    INV_TXFM_FN %1, %2, %3, 16x8, 16, 0-8*16
+%else
+    INV_TXFM_FN %1, %2, %3, 16x8, 8, 0-13*16
+%endif
+%ifidn %1_%2, dct_dct
+    imul r5d, [cq], 181
+    mov [cq], eobd ; 0
+    mov r3d, 8
+    add r5d, 128
+    sar r5d, 8
+    imul r5d, 181
+%if ARCH_X86_32
+    add rsp, 1*16
+%endif
+    jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
+%endif
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, identity, 6
+INV_TXFM_16X8_FN dct, adst
+INV_TXFM_16X8_FN dct, flipadst
+
+cglobal idct_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_64
+    DECLARE_REG_TMP 6, 4, 6
+%else
+    mov [rsp+gprsize+12*16], r1
+    DECLARE_REG_TMP 1, 4, 3
+%endif
+    lea t0, [o(.main)]
+.loop_main:
+%undef cmp
+%if ARCH_X86_64
+    xor r5d, r5d
+    cmp eobd, 10
+    setge r5b
+%else
+    mov r5d, 1
+    cmp eobd, 10
+    sbb r5d, 0
+%endif
+    shl r5d, 4
+
+    lea r3, [rsp+gprsize]
+.loop_pass1:
+    call t0
+%if ARCH_X86_64
+    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+    mova [cq+4*32+r5], m8
+    mova [cq+5*32+r5], m9
+    mova [cq+6*32+r5], m10
+    mova [cq+7*32+r5], m11
+%else
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova [cq+4*32+r5], m0
+    mova [cq+5*32+r5], m1
+    mova [cq+6*32+r5], m2
+    mova [cq+7*32+r5], m3
+    mova m0, [rsp+gprsize+ 8*16]
+    mova m2, [rsp+gprsize+ 9*16]
+    mova m4, [rsp+gprsize+10*16]
+    mova m6, [rsp+gprsize+11*16]
+%endif
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    pxor m7, m7
+    REPX {mova [cq+x*32+r5], m7}, 8, 9, 10, 11, 12, 13, 14, 15
+    test r5d, r5d
+    jz .end
+    mova [cq+0*32+r5], m0
+    mova [cq+1*32+r5], m1
+    mova [cq+2*32+r5], m2
+    mova [cq+3*32+r5], m3
+    xor r5d, r5d
+    jmp .loop_pass1
+.end:
+
+    jmp tx2q
+.main:
+%if ARCH_X86_64
+    mova m11, [o(pd_2048)]
+    mova m12, [o(clip_18b_min)]
+    mova m13, [o(clip_18b_max)]
+    mova m14, [o(pd_2896)]
+%endif
+    mova m0, [cq+ 1*32+r5]
+    mova m1, [cq+ 3*32+r5]
+    mova m2, [cq+ 5*32+r5]
+    mova m3, [cq+ 7*32+r5]
+    mova m4, [cq+ 9*32+r5]
+    mova m5, [cq+11*32+r5]
+    mova m6, [cq+13*32+r5]
+    mova m7, [cq+15*32+r5]
+    call m(idct_8x4_internal_16bpc).rect2_mul
+    call m(idct_16x4_internal_16bpc).main_oddhalf
+
+    mova m0, [cq+ 0*32+r5]
+    mova m1, [cq+ 2*32+r5]
+    mova m2, [cq+ 4*32+r5]
+    mova m3, [cq+ 6*32+r5]
+    mova m4, [cq+ 8*32+r5]
+    mova m5, [cq+10*32+r5]
+    mova m6, [cq+12*32+r5]
+    mova m7, [cq+14*32+r5]
+    call m(idct_8x4_internal_16bpc).rect2_mul
+    call m(idct_8x4_internal_16bpc).main_pass1
+    call m(idct_8x4_internal_16bpc).round
+    call m(idct_16x4_internal_16bpc).round
+%if ARCH_X86_64
+    packssdw m0, m1
+    packssdw m2, m3
+    packssdw m4, m5
+    packssdw m6, m7
+    packssdw m8, m9
+    packssdw m10, m11
+    packssdw m12, m13
+    packssdw m14, m15
+%endif
+    ret
+
+.pass2:
+%if ARCH_X86_32
+    mov strideq, [rsp+gprsize+12*16]
+%endif
+    mov r4d, 2
+.pass2_main:
+%if ARCH_X86_64
+    mova m8, [o(pw_2048)]
+    pxor m9, m9
+    mova m10, [o(pixel_10bpc_max)]
+%endif
+    lea r3, [strideq*3]
+    jmp .loop_pass2_entry
+.loop_pass2:
+    mova m0, [cq+0*32+ 0]
+    mova m1, [cq+1*32+ 0]
+    mova m2, [cq+2*32+ 0]
+    mova m3, [cq+3*32+ 0]
+.loop_pass2_entry:
+    mova m4, [cq+0*32+16]
+    mova m5, [cq+1*32+16]
+    mova m6, [cq+2*32+16]
+    mova m7, [cq+3*32+16]
+%if ARCH_X86_32
+    lea r5, [o(itx8_start)]
+%endif
+    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+    call m(idct_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+    pxor m7, m7
+%endif
+    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+    add dstq, 16
+    add cq, 4*32
+    dec r4d
+    jg .loop_pass2
+    RET
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, adst
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, identity, 6
+
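+; the remaining 16x8 types only install their .main kernel in t0 and
+; reuse idct_16x8's .loop_main pass-1 driver; eobd < 10 restricts pass 1
+; to a single iteration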
+cglobal iadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+    mov [rsp+gprsize+12*16], r1
+%endif
+    lea t0, [o(.main)]
+    jmp m(idct_16x8_internal_16bpc).loop_main
+
+.main:
+%if ARCH_X86_64
+    mova m11, [o(pd_2048)]
+    mova m12, [o(clip_18b_min)]
+    mova m13, [o(clip_18b_max)]
+    mova m14, [o(pd_2896)]
+%endif
+    mova m0, [cq+ 2*32+r5]
+    mova m1, [cq+13*32+r5]
+    mova m2, [cq+ 6*32+r5]
+    mova m3, [cq+ 9*32+r5]
+    mova m4, [cq+10*32+r5]
+    mova m5, [cq+ 5*32+r5]
+    mova m6, [cq+14*32+r5]
+    mova m7, [cq+ 1*32+r5]
+    call m(idct_8x4_internal_16bpc).rect2_mul
+    call m(iadst_16x4_internal_16bpc).main_part1
+    mova m0, [cq+ 0*32+r5]
+    mova m1, [cq+15*32+r5]
+    mova m2, [cq+ 4*32+r5]
+    mova m3, [cq+11*32+r5]
+    mova m4, [cq+ 8*32+r5]
+    mova m5, [cq+ 7*32+r5]
+    mova m6, [cq+12*32+r5]
+    mova m7, [cq+ 3*32+r5]
+%if ARCH_X86_32
+    add r3, 8*16
+%endif
+    call m(idct_8x4_internal_16bpc).rect2_mul
+%if ARCH_X86_32
+    sub r3, 8*16
+%endif
+    call m(iadst_16x4_internal_16bpc).main_part2
+    call m(iadst_16x4_internal_16bpc).round
+%if ARCH_X86_64
+    packssdw m0, m1
+    packssdw m2, m3
+    packssdw m4, m5
+    packssdw m6, m7
+    packssdw m8, m9
+    packssdw m10, m11
+    packssdw m12, m13
+    packssdw m14, m15
+%endif
+    ret
+
+.pass2:
+%if ARCH_X86_32
+    mov strideq, [rsp+gprsize+12*16]
+%endif
+    mov r4d, 2
+%if ARCH_X86_64
+    mova m8, [o(pw_2048)]
+    pxor m9, m9
+    mova m10, [o(pixel_10bpc_max)]
+    mova m11, [o(pw_m2048)]
+%endif
+    lea r3, [strideq*3]
+    jmp .loop_pass2_entry
+.loop_pass2:
+    mova m0, [cq+0*32+ 0]
+    mova m1, [cq+1*32+ 0]
+    mova m2, [cq+2*32+ 0]
+    mova m3, [cq+3*32+ 0]
+.loop_pass2_entry:
+    mova m4, [cq+0*32+16]
+    mova m5, [cq+1*32+16]
+    mova m6, [cq+2*32+16]
+    mova m7, [cq+3*32+16]
+%if ARCH_X86_32
+    lea r5, [o(itx8_start)]
+%endif
+    call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main
+    call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end
+    call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+    pxor m7, m7
+%endif
+    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+    add dstq, 16
+    add cq, 4*32
+    dec r4d
+    jg .loop_pass2
+    RET
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+INV_TXFM_16X8_FN flipadst, identity, 6
+
+cglobal iflipadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+    mov [rsp+gprsize+12*16], r1
+%endif
+    lea t0, [o(.main)]
+    jmp m(idct_16x8_internal_16bpc).loop_main
+.main:
+    call m(iadst_16x8_internal_16bpc).main
+%if ARCH_X86_64
+    pshufd m1, m0, q1032
+    pshufd m3, m2, q1032
+    pshufd m5, m4, q1032
+    pshufd m7, m6, q1032
+    pshufd m0, m14, q1032
+    pshufd m2, m12, q1032
+    pshufd m4, m10, q1032
+    pshufd m6, m8, q1032
+    mova m14, m1
+    mova m12, m3
+    mova m10, m5
+    mova m8, m7
+%else
+    pshufd m1, m0, q1032
+    pshufd m3, m2, q1032
+    pshufd m5, m4, q1032
+    pshufd m7, m6, q1032
+    pshufd m0, [r3+11*16], q1032
+    pshufd m2, [r3+10*16], q1032
+    pshufd m4, [r3+9*16], q1032
+    pshufd m6, [r3+8*16], q1032
+    mova [r3+8*16], m7
+    mova [r3+9*16], m5
+    mova [r3+10*16], m3
+    mova [r3+11*16], m1
+%endif
+    ret
+
+.pass2:
+%if ARCH_X86_32
+    mov strideq, [rsp+gprsize+12*16]
+%endif
+    lea dstq, [dstq+strideq*8]
+    neg strideq
+    add dstq, strideq
+%if ARCH_X86_32
+    mov [rsp+gprsize+12*16], strideq
+%endif
+    jmp m(iadst_16x8_internal_16bpc).pass2
+
+INV_TXFM_16X8_FN identity, dct, -54
+INV_TXFM_16X8_FN identity, adst, -54
+INV_TXFM_16X8_FN identity, flipadst, -54
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+    mov [rsp+gprsize+12*16], r1
+%endif
+    lea t0, [o(.main)]
+    jmp m(idct_16x8_internal_16bpc).loop_main
+.main:
+%if ARCH_X86_64
+    mova m15, [o(pd_2896)]
+    pmulld m0, m15, [cq+ 0*32+r5]
+    pmulld m1, m15, [cq+ 1*32+r5]
+    pmulld m2, m15, [cq+ 2*32+r5]
+    pmulld m3, m15, [cq+ 3*32+r5]
+    pmulld m4, m15, [cq+ 4*32+r5]
+    pmulld m5, m15, [cq+ 5*32+r5]
+    pmulld m6, m15, [cq+ 6*32+r5]
+    pmulld m7, m15, [cq+ 7*32+r5]
+    pmulld m8, m15, [cq+ 8*32+r5]
+    pmulld m9, m15, [cq+ 9*32+r5]
+    pmulld m10, m15, [cq+10*32+r5]
+    pmulld m11, m15, [cq+11*32+r5]
+    pmulld m12, m15, [cq+12*32+r5]
+    pmulld m13, m15, [cq+13*32+r5]
+    pmulld m14, m15, [cq+14*32+r5]
+    pmulld m15, [cq+15*32+r5]
+    mova [r3], m15
+    mova m15, [o(pd_2048)]
+    REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+                         m8, m9, m10, m11, m12, m13, m14
+    paddd m15, [r3]
+    REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+                         m8, m9, m10, m11, m12, m13, m14, m15
+    mova [r3], m15
+    mova m15, [o(pd_11586)]
+    REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+                          m8, m9, m10, m11, m12, m13, m14
+    pmulld m15, [r3]
+    mova [r3], m15
+    mova m15, [o(pd_6144)]
+    REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+                         m8, m9, m10, m11, m12, m13, m14
+    paddd m15, [r3]
+    REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+                         m8, m9, m10, m11, m12, m13, m14, m15
+    packssdw m0, m1
+    packssdw m2, m3
+    packssdw m4, m5
+    packssdw m6, m7
+    packssdw m8, m9
+    packssdw m10, m11
+    packssdw m12, m13
+    packssdw m14, m15
+%else
+    mova m0, [cq+ 0*32+r5]
+    mova m1, [cq+ 1*32+r5]
+    mova m2, [cq+ 2*32+r5]
+    mova m3, [cq+ 3*32+r5]
+    mova m4, [cq+ 4*32+r5]
+    mova m5, [cq+ 5*32+r5]
+    mova m6, [cq+ 6*32+r5]
+    mova m7, [cq+ 7*32+r5]
+    call m(idct_8x4_internal_16bpc).rect2_mul
+    mova [r3], m7
+    mova m7, [o(pd_11586)]
+    REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
+    pmulld m7, [r3]
+    mova [r3], m7
+    mova m7, [o(pd_6144)]
+    REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+    paddd m7, [r3]
+    REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
+    packssdw m0, m1
+    packssdw m2, m3
+    packssdw m4, m5
+    packssdw m6, m7
+    mova [r3+ 8*16], m0
+    mova [r3+ 9*16], m2
+    mova [r3+10*16], m4
+    mova [r3+11*16], m6
+    mova m0, [cq+ 8*32+r5]
+    mova m1, [cq+ 9*32+r5]
+    mova m2, [cq+10*32+r5]
+    mova m3, [cq+11*32+r5]
+    mova m4, [cq+12*32+r5]
+    mova m5, [cq+13*32+r5]
+    mova m6, [cq+14*32+r5]
+    mova m7, [cq+15*32+r5]
+    call m(idct_8x4_internal_16bpc).rect2_mul
+    mova [r3], m7
+    mova m7, [o(pd_11586)]
+    REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
+    pmulld m7, [r3]
+    mova [r3], m7
+    mova m7, [o(pd_6144)]
+    REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+    paddd m7, [r3]
+    REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
+    packssdw m0, m1
+    packssdw m2, m3
+    packssdw m4, m5
+    packssdw m6, m7
+%endif
+    ret
+.pass2:
+%if ARCH_X86_32
+    mov strideq, [rsp+gprsize+12*16]
+%endif
+    mov r4d, 2
+%if ARCH_X86_64
+    mova m8, [o(pw_4096)]
+    pxor m9, m9
+    mova m10, [o(pixel_10bpc_max)]
+%endif
+    lea r3, [strideq*3]
+    jmp .loop_pass2_entry
+.loop_pass2:
+    mova m0, [cq+0*32+ 0]
+    mova m1, [cq+1*32+ 0]
+    mova m2, [cq+2*32+ 0]
+    mova m3, [cq+3*32+ 0]
+.loop_pass2_entry:
+    mova m4, [cq+0*32+16]
+    mova m5, [cq+1*32+16]
+    mova m6, [cq+2*32+16]
+    mova m7, [cq+3*32+16]
+%if ARCH_X86_64
+    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+%else
+    mova [rsp+gprsize], m7
+    mova m7, [o(pw_4096)]
+    call m(idct_8x8_internal_16bpc).round4_and_write_8x8
+%endif
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+    pxor m7, m7
+%endif
+    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+    add dstq, 16
+    add cq, 4*32
+    dec r4d
+    jg .loop_pass2
+    RET
+
+%macro INV_TXFM_16X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
+%if ARCH_X86_64
+    INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 16, 0-(16+WIN64)*16
+%else
+    INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 8, 0-17*16
+%endif
+%ifidn %1_%2, dct_dct
+    imul r5d, [cq], 181
+    mov [cq], eobd ; 0
+    mov r3d, 16
+    add r5d, 640
+    sar r5d, 10
+    add rsp, (5+ARCH_X86_64*3+WIN64)*16
+    jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
+%endif
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, identity, v
+INV_TXFM_16X16_FN dct, adst
+INV_TXFM_16X16_FN dct, flipadst
+
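+; .pass1_full walks the eob table backwards (.zero_loop) to find the last
+; strip that can still hold non-zero coefficients, so pass 1 only iterates
+; over that many 4-sample strips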
+cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_64
+    DECLARE_REG_TMP 6, 7
+%if WIN64
+    mov [rsp+16*16+gprsize], r7
+%endif
+%elif ARCH_X86_32
+    DECLARE_REG_TMP 1, 6
+    mov [rsp+16*16+gprsize*1], r1
+    mov [rsp+16*16+gprsize*2], r6
+%endif
+    lea t0, [o(.main)]
+.pass1_full:
+%undef cmp
+    mov t1d, 4
+.zero_loop:
+    dec t1d
+    cmp eobb, byte [r5+t1]
+    jb .zero_loop
+    mov r5d, t1d
+    shl r5d, 4
+%if ARCH_X86_32
+    ; restore pic-ptr
+    mov r6, [rsp+16*16+2*gprsize]
+%endif
+    ; setup stack pointer
+    lea r3, [rsp+gprsize]
+.loop_pass1:
+    call t0
+%if ARCH_X86_64
+    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+    mova [cq+4*64+r5], m8
+    mova [cq+5*64+r5], m9
+    mova [cq+6*64+r5], m10
+    mova [cq+7*64+r5], m11
+%else
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova [cq+4*64+r5], m0
+    mova [cq+5*64+r5], m1
+    mova [cq+6*64+r5], m2
+    mova [cq+7*64+r5], m3
+    mova m0, [rsp+gprsize+ 8*16]
+    mova m2, [rsp+gprsize+ 9*16]
+    mova m4, [rsp+gprsize+10*16]
+    mova m6, [rsp+gprsize+11*16]
+%endif
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova [cq+0*64+r5], m0
+    mova [cq+1*64+r5], m1
+    mova [cq+2*64+r5], m2
+    mova [cq+3*64+r5], m3
+    pxor m0, m0
+    REPX {mova [cq+x*64+r5], m0}, 8, 9, 10, 11, 12, 13, 14, 15
+    sub r5d, 16
+    jge .loop_pass1
+
+%if ARCH_X86_32
+    ; restore pic-ptr
+    mov r1, [rsp+16*16+1*gprsize]
+%endif
+    jmp tx2q
+.main:
+%if ARCH_X86_64
+    mova m11, [o(pd_2048)]
+    mova m12, [o(clip_18b_min)]
+    mova m13, [o(clip_18b_max)]
+    mova m14, [o(pd_2896)]
+%endif
+
+    mova m0, [cq+ 1*64+r5]
+    mova m1, [cq+ 3*64+r5]
+    mova m2, [cq+ 5*64+r5]
+    mova m3, [cq+ 7*64+r5]
+    mova m4, [cq+ 9*64+r5]
+    mova m5, [cq+11*64+r5]
+    mova m6, [cq+13*64+r5]
+    mova m7, [cq+15*64+r5]
+    call m(idct_16x4_internal_16bpc).main_oddhalf
+
+    mova m0, [cq+ 0*64+r5]
+    mova m1, [cq+ 2*64+r5]
+    mova m2, [cq+ 4*64+r5]
+    mova m3, [cq+ 6*64+r5]
+    mova m4, [cq+ 8*64+r5]
+    mova m5, [cq+10*64+r5]
+    mova m6, [cq+12*64+r5]
+    mova m7, [cq+14*64+r5]
+    call m(idct_8x4_internal_16bpc).main_pass1
+    call m(idct_8x4_internal_16bpc).round
+    call .round
+%if ARCH_X86_64
+    packssdw m0, m1
+    packssdw m2, m3
+    packssdw m4, m5
+    packssdw m6, m7
+    packssdw m8, m9
+    packssdw m10, m11
+    packssdw m12, m13
+    packssdw m14, m15
+%endif
+    ret
+.round:
+%if ARCH_X86_64
+    REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+    psrld m8, m11, 10 ; 2
+    REPX {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+    mova m8, [r3+1*16]
+    mova m9, [r3+2*16]
+    mova m10, [r3+3*16]
+    mova m11, [r3+4*16]
+    mova m12, [r3+5*16]
+    mova m13, [r3+6*16]
+    mova m14, [r3+7*16]
+    psubd m15, m0, m14 ; out15
+    paddd m0, m14 ; out0
+    psubd m14, m1, m13 ; out14
+    paddd m1, m13 ; out1
+    psubd m13, m2, m12 ; out13
+    paddd m2, m12 ; out2
+    psubd m12, m3, m11 ; out12
+    paddd m3, m11 ; out3
+    psubd m11, m4, m10 ; out11
+    paddd m4, m10 ; out4
+    psubd m10, m5, m9 ; out10
+    paddd m5, m9 ; out5
+    psubd m9, m6, m8 ; out9
+    paddd m6, m8 ; out6
+    psubd m8, m7, [r3+0*16] ; out8
+    paddd m7, [r3+0*16] ; out7
+    REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7, \
+                       m8, m9, m10, m11, m12, m13, m14, m15
+    ; and out0-15 is now in m0-15
+%else
+    mova [r3+ 0*16], m0
+    mova m0, [o(clip_18b_min)]
+    REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
+    pmaxsd m0, [r3+ 0*16]
+    mova [r3+ 0*16], m7
+    mova m7, [o(clip_18b_max)]
+    REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
+    pminsd m7, [r3+ 0*16]
+    mova [r3+ 0*16], m0
+    mova m0, [o(pd_2)]
+    REPX {paddd x, m0}, m1, m2, m3, m4, m5, m6, m7
+    paddd m0, [r3+ 0*16]
+    mova [r3+ 0*16], m0
+    mova [r3+ 1*16], m1
+    mova [r3+ 2*16], m2
+    mova m1, [r3+11*16]
+    mova m2, [r3+10*16]
+    psubd m0, m7, m1
+    paddd m7, m1
+    psubd m1, m6, m2
+    paddd m6, m2
+    REPX {psrad x, 2}, m0, m1, m6, m7
+    packssdw m0, m1 ; out8-9
+    packssdw m6, m7 ; out6-7
+    mova [r3+11*16], m6
+    mova m1, [r3+9*16]
+    mova m7, [r3+8*16]
+    psubd m2, m5, m1
+    paddd m5, m1
+    psubd m1, m4, m7
+    paddd m4, m7
+    REPX {psrad x, 2}, m2, m1, m4, m5
+    packssdw m2, m1 ; out10-11
+    packssdw m4, m5 ; out4-5
+    mova m1, [r3+2*16]
+    mova [r3+10*16], m4
+    mova m6, [r3+7*16]
+    mova m7, [r3+6*16]
+    psubd m4, m3, m6
+    paddd m3, m6
+    psubd m6, m1, m7
+    paddd m1, m7
+    REPX {psrad x, 2}, m4, m6, m1, m3
+    packssdw m4, m6 ; out12-13
+    packssdw m1, m3 ; out2-3
+    mova m3, [r3+1*16]
+    mova [r3+9*16], m1
+    mova m1, [r3+0*16]
+    mova m5, [r3+5*16]
+    mova m7, [r3+4*16]
+    psubd m6, m3, m5
+    paddd m3, m5
+    psubd m5, m1, m7
+    paddd m1, m7
+    REPX {psrad x, 2}, m6, m5, m1, m3
+    packssdw m6, m5 ; out14-15
+    packssdw m1, m3 ; out0-1
+    mova [r3+8*16], m1
+%endif
+    ret
+
+.pass2:
+%if ARCH_X86_64
+    mova m8, [o(pw_2048)]
+    pxor m9, m9
+    mova m10, [o(pixel_10bpc_max)]
+    mov r7, dstq
+%else
+    mov [rsp+2*gprsize+16*16], dstq
+%endif
+    lea r3, [strideq*3]
+    mov r4d, 2
+.loop_pass2:
+%if ARCH_X86_32
+    lea r5, [o(itx8_start)]
+%endif
+    mova m0, [cq+0*64+ 0]
+    mova m1, [cq+2*64+ 0]
+    mova m2, [cq+0*64+16]
+    mova m3, [cq+2*64+16]
+    mova m4, [cq+0*64+32]
+    mova m5, [cq+2*64+32]
+    mova m6, [cq+0*64+48]
+    mova m7, [cq+2*64+48]
+    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+    mova [rsp+gprsize+3*16], m0
+    mova [rsp+gprsize+4*16], m1
+    mova [rsp+gprsize+5*16], m2
+    mova [rsp+gprsize+6*16], m3
+    mova [rsp+gprsize+7*16], m4
+    mova [rsp+gprsize+8*16], m5
+    mova [rsp+gprsize+9*16], m6
+    ; m7 is already stored in [rsp+gprsize+0*16]
+    mova m0, [cq+1*64+ 0]
+    mova m1, [cq+3*64+ 0]
+    mova m2, [cq+1*64+16]
+    mova m3, [cq+3*64+16]
+    mova m4, [cq+1*64+32]
+    mova m5, [cq+3*64+32]
+    mova m6, [cq+1*64+48]
+    mova m7, [cq+3*64+48]
+    call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
+
+    ; out0-7 is in rsp+gprsize+3-10*mmsize
+    ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
+
+%if ARCH_X86_64
+    lea dstq, [r7+strideq*8]
+%else
+    mov dstq, [rsp+2*gprsize+16*16]
+    lea dstq, [dstq+strideq*8]
+%endif
+    call m(idct_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+    mov dstq, r7
+%else
+    mov dstq, [rsp+2*gprsize+16*16]
+%endif
+    mova m0, [rsp+gprsize+ 3*16]
+    mova m1, [rsp+gprsize+ 4*16]
+    mova m2, [rsp+gprsize+ 5*16]
+    mova m3, [rsp+gprsize+ 6*16]
+    mova m4, [rsp+gprsize+ 7*16]
+    mova m5, [rsp+gprsize+ 8*16]
+    mova m6, [rsp+gprsize+ 9*16]
+    mova m7, [rsp+gprsize+10*16]
+    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+%if ARCH_X86_64
+    add r7, 16
+%define mzero m9
+%else
+    add dword [rsp+2*gprsize+16*16], 16
+%define mzero m7
+    pxor m7, m7
+%endif
+    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+    add cq, 64*4
+    REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1
+%undef mzero
+    dec r4d
+    jg .loop_pass2
+%if WIN64
+    mov r7, [rsp+16*16+gprsize]
+%endif
+    RET
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, adst
+INV_TXFM_16X16_FN adst, flipadst
+
+cglobal iadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+    mov [rsp+16*16+gprsize], r7
+%elif ARCH_X86_32
+    mov [rsp+16*16+gprsize*1], r1
+    mov [rsp+16*16+gprsize*2], r6
+%endif
+    lea t0, [o(.main)]
+    jmp m(idct_16x16_internal_16bpc).pass1_full
+
+.main:
+%if ARCH_X86_64
+    mova m11, [o(pd_2048)]
+    mova m12, [o(clip_18b_min)]
+    mova m13, [o(clip_18b_max)]
+    mova m14, [o(pd_2896)]
+%endif
+    mova m0, [cq+ 2*64+r5]
+    mova m1, [cq+13*64+r5]
+    mova m2, [cq+ 6*64+r5]
+    mova m3, [cq+ 9*64+r5]
+    mova m4, [cq+10*64+r5]
+    mova m5, [cq+ 5*64+r5]
+    mova m6, [cq+14*64+r5]
+    mova m7, [cq+ 1*64+r5]
+    call m(iadst_16x4_internal_16bpc).main_part1
+    mova m0, [cq+ 0*64+r5]
+    mova m1, [cq+15*64+r5]
+    mova m2, [cq+ 4*64+r5]
+    mova m3, [cq+11*64+r5]
+    mova m4, [cq+ 8*64+r5]
+    mova m5, [cq+ 7*64+r5]
+    mova m6, [cq+12*64+r5]
+    mova m7, [cq+ 3*64+r5]
+    call m(iadst_16x4_internal_16bpc).main_part2
+    call .round
+%if ARCH_X86_64
+    packssdw m0, m1
+    packssdw m2, m3
+    packssdw m4, m5
+    packssdw m6, m7
+    packssdw m8, m9
+    packssdw m10, m11
+    packssdw m12, m13
+    packssdw m14, m15
+%endif
+    ret
+.round:
+%if ARCH_X86_64
+    pcmpeqd m8, m8 ; -1
+    mova m15, [o(pd_10240)]
+    psrld m14, 10 ; +2
+    psubd m13, m14, m8 ; +3
+    REPX {pxor x, m8 }, m1, m3, m5, m7
+    REPX {paddd x, m14}, m0, m2
+    REPX {paddd x, m13}, m1, m3
+    REPX {paddd x, m15}, m4, m5, m6, m7
+    paddd m13, m15, m8 ; +10239
+    paddd m8, m15, m9
+    psubd m9, m13, m10
+    paddd m10, m15, m11
+    psubd m11, m13, m12
+    paddd m12, m14, [r3+3*16]
+    psubd m13, m14, [r3+2*16]
+    psubd m15, m14, [r3+0*16]
+    paddd m14, [r3+1*16]
+    REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15
+    REPX {psrad x, 14}, m4, m5, m6, m7, m8, m9, m10, m11
+%else
+    mova [r3+8*16], m1
+    mova [r3+9*16], m3
+    mova m3, [o(pd_10240)]
+    pcmpeqd m1, m1
+    REPX {pxor x, m1}, m5, m7
+    REPX {paddd x, m3}, m4, m5, m6, m7
+    REPX {psrad x, 14}, m4, m5, m6, m7
+    packssdw m4, m5
+    packssdw m6, m7
+    mova [r3+10*16], m4
+    mova [r3+11*16], m6
+    mova m4, [r3+4*16]
+    mova m5, [r3+5*16]
+    mova m6, [r3+6*16]
+    mova m7, [r3+7*16]
+    mova m3, [o(pd_2)]
+    REPX {pxor x, m1}, m5, m7
+    REPX {paddd x, m3}, m4, m6
+    psubd m3, m1
+    REPX {paddd x, m3}, m5, m7
+    REPX {psrad x, 2 }, m4, m5, m6, m7
+    packssdw m4, m5
+    packssdw m6, m7
+    mova m5, [r3+8*16]
+    mova m7, [r3+9*16]
+    mova [r3+8*16], m4
+    mova [r3+9*16], m6
+    mova m3, [o(pd_10240)]
+    REPX {pxor x, m1}, m5, m7
+    REPX {paddd x, m3}, m0, m5, m2, m7
+    REPX {psrad x, 14}, m0, m5, m2, m7
+    packssdw m0, m5
+    packssdw m2, m7
+    mova m4, [r3+0*16]
+    mova m5, [r3+1*16]
+    mova m6, [r3+2*16]
+    mova m7, [r3+3*16]
+    mova m3, [o(pd_2)]
+    REPX {pxor x, m1}, m5, m7
+    REPX {paddd x, m3}, m4, m6
+    psubd m3, m1
+    REPX {paddd x, m3}, m5, m7
+    REPX {psrad x, 2 }, m4, m5, m6, m7
+    packssdw m4, m5
+    packssdw m6, m7
+%endif
+    ret
+.pass2:
+%if ARCH_X86_64
+    mova m8, [o(pw_2048)]
+    mova m11, [o(pw_m2048)]
+    pxor m9, m9
+    mova m10, [o(pixel_10bpc_max)]
+    mov r7, dstq
+%else
+    mov [rsp+2*gprsize+16*16], dstq
+%endif
+    lea r3, [strideq*3]
+    mov r4d, 2
+.loop_pass2:
+%if ARCH_X86_32
+    lea r5, [o(itx8_start)]
+%endif
+    mova m0, [cq+0*64+32]
+    mova m1, [cq+1*64+32]
+    mova m2, [cq+2*64+16]
+    mova m3, [cq+3*64+16]
+    mova m4, [cq+0*64+ 0]
+    mova m5, [cq+1*64+ 0]
+    mova m6, [cq+2*64+48]
+    mova m7, [cq+3*64+48]
+    mova [rsp+gprsize+3*16], m0
+    mova [rsp+gprsize+4*16], m1
+    mova [rsp+gprsize+5*16], m2
+    mova [rsp+gprsize+6*16], m3
+    mova [rsp+gprsize+7*16], m4
+    mova [rsp+gprsize+8*16], m5
+    mova [rsp+gprsize+9*16], m6
+    mova [rsp+gprsize+10*16], m7
+    mova m0, [cq+2*64+ 0]
+    mova m1, [cq+3*64+ 0]
+    mova m2, [cq+0*64+16]
+    mova m3, [cq+1*64+16]
+    mova m4, [cq+2*64+32]
+    mova m5, [cq+3*64+32]
+    mova m6, [cq+0*64+48]
+    mova m7, [cq+1*64+48]
+    call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main
+    call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end
+
+    ; out0-7 is in rsp+gprsize+3-10*mmsize
+    ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
+
+%if ARCH_X86_64
+    lea dstq, [r7+strideq*8]
+%else
+    mov dstq, [rsp+2*gprsize+16*16]
+    lea dstq, [dstq+strideq*8]
+%endif
+    call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+    mov dstq, r7
+%else
+    mov dstq, [rsp+2*gprsize+16*16]
+%endif
+    mova m0, [rsp+gprsize+ 3*16]
+    mova m1, [rsp+gprsize+ 4*16]
+    mova m2, [rsp+gprsize+ 5*16]
+    mova m3, [rsp+gprsize+ 6*16]
+    mova m4, [rsp+gprsize+ 7*16]
+    mova m5, [rsp+gprsize+ 8*16]
+    mova m6, [rsp+gprsize+ 9*16]
+    mova m7, [rsp+gprsize+10*16]
+    call m(iadst_8x8_internal_16bpc).round1_and_write_8x8
+%if ARCH_X86_64
+    add r7, 16
+%define mzero m9
+%else
+    add dword [rsp+2*gprsize+16*16], 16
+%define mzero m7
+    pxor m7, m7
+%endif
+    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+    add cq, 64*4
+    REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1
+%undef mzero
+    dec r4d
+    jg .loop_pass2
+%if WIN64
+    mov r7, [rsp+16*16+gprsize]
+%endif
+    RET
+
+INV_TXFM_16X16_FN flipadst, dct
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+    mov [rsp+16*16+gprsize], r7
+%elif ARCH_X86_32
+    mov [rsp+16*16+gprsize*1], r1
+    mov [rsp+16*16+gprsize*2], r6
+%endif
+    lea t0, [o(.main)]
+    jmp m(idct_16x16_internal_16bpc).pass1_full
+
+.main:
+    call m(iadst_16x16_internal_16bpc).main
+%if ARCH_X86_64
+    mova m1, m0
+    mova m3, m2
+    mova m5, m4
+    mova m7, m6
+    pshufd m0, m14, q1032
+    pshufd m2, m12, q1032
+    pshufd m4, m10, q1032
+    pshufd m6, m8, q1032
+    pshufd m8, m7, q1032
+    pshufd m10, m5, q1032
+    pshufd m12, m3, q1032
+    pshufd m14, m1, q1032
+%else
+    pshufd m1, m0, q1032
+    pshufd m3, m2, q1032
+    pshufd m5, m4, q1032
+    pshufd m7, m6, q1032
+    pshufd m0, [r3+11*16], q1032
+    pshufd m2, [r3+10*16], q1032
+    pshufd m4, [r3+9*16], q1032
+    pshufd m6, [r3+8*16], q1032
+    mova [r3+11*16], m1
+    mova [r3+10*16], m3
+    mova [r3+ 9*16], m5
+    mova [r3+ 8*16], m7
+%endif
+    ret
+
+.pass2:
+    lea r3, [strideq*3]
+    lea r3, [r3*5]
+    add dstq, r3
+    neg strideq
+    jmp m(iadst_16x16_internal_16bpc).pass2
+
+INV_TXFM_16X16_FN identity, dct, h
+INV_TXFM_16X16_FN identity, identity
+
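+; the 16-sample identity transform scales by 2*sqrt(2): pd_11586 is
+; ~2*sqrt(2) in .12 fixed point, and the pd_10240 bias with the >>14
+; shift combines the 12-bit multiply rounding with pass 1's >>2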
+cglobal iidentity_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+    mov [rsp+16*16+gprsize], r7
+%elif ARCH_X86_32
+    mov [rsp+16*16+gprsize*1], r1
+    mov [rsp+16*16+gprsize*2], r6
+%endif
+    lea t0, [o(.main)]
+    jmp m(idct_16x16_internal_16bpc).pass1_full
+
+.main:
+%if ARCH_X86_64
+    mova m15, [o(pd_11586)]
+    pmulld m0, m15, [cq+ 0*64+r5]
+    pmulld m1, m15, [cq+ 1*64+r5]
+    pmulld m2, m15, [cq+ 2*64+r5]
+    pmulld m3, m15, [cq+ 3*64+r5]
+    pmulld m4, m15, [cq+ 4*64+r5]
+    pmulld m5, m15, [cq+ 5*64+r5]
+    pmulld m6, m15, [cq+ 6*64+r5]
+    pmulld m7, m15, [cq+ 7*64+r5]
+    pmulld m8, m15, [cq+ 8*64+r5]
+    pmulld m9, m15, [cq+ 9*64+r5]
+    pmulld m10, m15, [cq+10*64+r5]
+    pmulld m11, m15, [cq+11*64+r5]
+    pmulld m12, m15, [cq+12*64+r5]
+    pmulld m13, m15, [cq+13*64+r5]
+    pmulld m14, m15, [cq+14*64+r5]
+    pmulld m15, [cq+15*64+r5]
+    mova [r3], m15
+    mova m15, [o(pd_10240)]
+    REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+                         m8, m9, m10, m11, m12, m13, m14
+    paddd m15, [r3]
+    REPX {psrad x, 14 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+                         m8, m9, m10, m11, m12, m13, m14, m15
+    packssdw m0, m1
+    packssdw m2, m3
+    packssdw m4, m5
+    packssdw m6, m7
+    packssdw m8, m9
+    packssdw m10, m11
+    packssdw m12, m13
+    packssdw m14, m15
+%else
+    mova m7, [o(pd_11586)]
+    pmulld m0, m7, [cq+ 0*64+r5]
+    pmulld m1, m7, [cq+ 1*64+r5]
+    pmulld m2, m7, [cq+ 2*64+r5]
+    pmulld m3, m7, [cq+ 3*64+r5]
+    pmulld m4, m7, [cq+ 4*64+r5]
+    pmulld m5, m7, [cq+ 5*64+r5]
+    pmulld m6, m7, [cq+ 6*64+r5]
+    pmulld m7, [cq+ 7*64+r5]
+    mova [r3], m7
+    mova m7, [o(pd_10240)]
+    REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+    paddd m7, [r3]
+    REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
+    packssdw m0, m1
+    packssdw m2, m3
+    packssdw m4, m5
+    packssdw m6, m7
+    mova [r3+8*16], m0
+    mova [r3+9*16], m2
+    mova [r3+10*16], m4
+    mova [r3+11*16], m6
+    mova m7, [o(pd_11586)]
+    pmulld m0, m7, [cq+ 8*64+r5]
+    pmulld m1, m7, [cq+ 9*64+r5]
+    pmulld m2, m7, [cq+10*64+r5]
+    pmulld m3, m7, [cq+11*64+r5]
+    pmulld m4, m7, [cq+12*64+r5]
+    pmulld m5, m7, [cq+13*64+r5]
+    pmulld m6, m7, [cq+14*64+r5]
+    pmulld m7, [cq+15*64+r5]
+    mova [r3], m7
+    mova m7, [o(pd_10240)]
+    REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+    paddd m7, [r3]
+    REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
+    packssdw m0, m1
+    packssdw m2, m3
+    packssdw m4, m5
+    packssdw m6, m7
+%endif
+    ret
+
+.pass2:
+%if ARCH_X86_64
+    mova m4, [o(pw_2048)]
+    mova m5, [o(pixel_10bpc_max)]
+    pxor m6, m6
+    mova m7, [o(pw_1697x16)]
+    mov r7, dstq
+%else
+    mov [rsp+2*gprsize+16*16], dstq
+%endif
+    mov r5d, 4
+    lea r3, [strideq*3]
+.pass2_loop:
+    mova m0, [cq+0*64+0]
+    mova m1, [cq+1*64+0]
+    mova m2, [cq+2*64+0]
+    mova m3, [cq+3*64+0]
+    call m(iidentity_8x16_internal_16bpc).main
+%if ARCH_X86_64
+    call m(idct_8x4_internal_16bpc).round1_and_write_8x4
+%else
+    call m(idct_8x4_internal_16bpc).round2_and_write_8x4
+%endif
+    REPX {mova [cq+x*16], m6}, 0, 4, 8, 12
+    add cq, 16
+    lea dstq, [dstq+strideq*4]
+    dec r5w
+    jg .pass2_loop
+    add cq, 64*3
+    btc r5d, 16
+    jc .end
+%if ARCH_X86_64
+    lea dstq, [r7+16]
+%else
+    mov dstq, [rsp+2*gprsize+16*16]
+    add dstq, 16
+%endif
+    add r5d, 4
+    jmp .pass2_loop
+.end:
+%if WIN64
+    mov r7, [rsp+16*16+gprsize]
+%endif
+    RET
+
+cglobal inv_txfm_add_identity_identity_8x32_16bpc, 4, 7, 8, dst, stride, c, eob
+%if ARCH_X86_32
+    LEA r6, $$
+%endif
+    mova m5, [o(pw_5)]
+    mova m7, [o(pixel_10bpc_max)]
+    pxor m6, m6
+    mov r5d, eobd
+    add eobb, 21
+    cmovc eobd, r5d ; 43, 107, 171 -> 64, 128, 192
+    lea r4, [strideq*3]
+.loop:
+    mova m0, [cq+128*0]
+    packssdw m0, [cq+128*1]
+    mova m1, [cq+128*2]
+    packssdw m1, [cq+128*3]
+    mova m2, [cq+128*4]
+    packssdw m2, [cq+128*5]
+    mova m3, [cq+128*6]
+    packssdw m3, [cq+128*7]
+    REPX {paddsw x, m5}, m0, m1, m2, m3
+    REPX {psraw x, 3 }, m0, m1, m2, m3
+    call .main_zero
+    add cq, 16
+    lea dstq, [dstq+strideq*4]
+    btc eobd, 16
+    jnc .loop
+    sub eobd, 64
+    jge .loop
+    RET
+ALIGN function_align
+.main_zero:
+    REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+.main:
+    punpckhwd m4, m0, m1
+    punpcklwd m0, m1
+    punpckhwd m1, m2, m3
+    punpcklwd m2, m3
+    punpckhwd m3, m0, m4
+    punpcklwd m0, m4
+    punpckhwd m4, m2, m1
+    punpcklwd m2, m1
+    punpckhqdq m1, m0, m2
+    punpcklqdq m0, m2
+    punpcklqdq m2, m3, m4
+    punpckhqdq m3, m4
+    paddw m0, [dstq+strideq*0]
+    paddw m1, [dstq+strideq*1]
+    paddw m2, [dstq+strideq*2]
+    paddw m3, [dstq+r4 ]
+    REPX {pmaxsw x, m6}, m0, m1, m2, m3
+    REPX {pminsw x, m7}, m0, m1, m2, m3
+    mova [dstq+strideq*0], m0
+    mova [dstq+strideq*1], m1
+    mova [dstq+strideq*2], m2
+    mova [dstq+r4 ], m3
+    ret
+
+cglobal inv_txfm_add_identity_identity_32x8_16bpc, 4, 7, 8, dst, stride, c, eob
+%if ARCH_X86_32
+    LEA r6, $$
+%endif
+    mova m5, [o(pw_4096)]
+    mova m7, [o(pixel_10bpc_max)]
+    pxor m6, m6
+    mov r4d, eobd
+    add eobb, 21
+    cmovc eobd, r4d
+    lea r4, [strideq*3]
+    mov r5, dstq
+.loop:
+    mova m0, [cq+32*0]
+    packssdw m0, [cq+32*1]
+    mova m1, [cq+32*2]
+    packssdw m1, [cq+32*3]
+    mova m2, [cq+32*4]
+    packssdw m2, [cq+32*5]
+    mova m3, [cq+32*6]
+    packssdw m3, [cq+32*7]
+    REPX {mova [cq+32*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+    REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+    call m(inv_txfm_add_identity_identity_8x32_16bpc).main
+    lea dstq, [dstq+strideq*4]
+    add cq, 16
+    btc eobd, 16
+    jnc .loop
+    add cq, 32*8-32
+    add r5, 16
+    mov dstq, r5
+    sub eobd, 64
+    jge .loop
+    RET
+
+cglobal inv_txfm_add_identity_identity_16x32_16bpc, 4, 7, 12, dst, stride, c, eob
+%if ARCH_X86_32
+    LEA r6, $$
+%else
+    mova m8, [o(pw_2896x8)]
+    mova m9, [o(pw_1697x16)]
+    mova m11, [o(pw_8192)]
+%endif
+    mova m7, [o(pixel_10bpc_max)]
+    lea r4, [strideq*3]
+    pxor m6, m6
+%if ARCH_X86_64
+    paddw m10, m11, m11 ; pw_16384
+%endif
+    mov r5, dstq
+    call .main
+    sub eobd, 36
+    jl .ret
+    add cq, 128*8-32
+    lea dstq, [r5+16]
+    call .main
+    sub cq, 128*8
+    lea dstq, [r5+strideq*8]
+    mov r5, dstq
+    call .main
+    sub eobd, 107 ; eob < 143
+    jl .ret
+    add cq, 128*8-32
+    lea dstq, [r5+16]
+    call .main
+    sub cq, 128*8
+    lea dstq, [r5+strideq*8]
+    mov r5, dstq
+    call .main
+    sub eobd, 128 ; eob < 271
+    jl .ret
+    add cq, 128*8-32
+    lea dstq, [r5+16]
+    call .main
+    sub cq, 128*8
+    lea dstq, [r5+strideq*8]
+    mov r5, dstq
+    call .main
+    sub eobd, 128 ; eob < 399
+    jl .ret
+    add cq, 128*8-32
+    lea dstq, [r5+16]
+    call .main
+.ret:
+    RET
+ALIGN function_align
+.main:
+    mova m0, [cq+128*0]
+    packssdw m0, [cq+128*1]
+    mova m1, [cq+128*2]
+    packssdw m1, [cq+128*3]
+    mova m2, [cq+128*4]
+    packssdw m2, [cq+128*5]
+    mova m3, [cq+128*6]
+    packssdw m3, [cq+128*7]
+%if ARCH_X86_64
+    REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
+    pmulhrsw m4, m9, m0
+    pmulhrsw m5, m9, m1
+    REPX {pmulhrsw x, m10}, m4, m5
+%else
+    mova m6, [o(pw_2896x8)]
+    REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
+    mova m5, [o(pw_1697x16)]
+    pmulhrsw m4, m5, m0
+    pmulhrsw m5, m1
+    mova m6, [o(pw_16384)]
+    REPX {pmulhrsw x, m6 }, m4, m5
+%endif
+    paddsw m0, m4
+    paddsw m1, m5
+%if ARCH_X86_64
+    pmulhrsw m4, m9, m2
+    pmulhrsw m5, m9, m3
+    REPX {pmulhrsw x, m10}, m4, m5
+%else
+    mova m5, [o(pw_1697x16)]
+    pmulhrsw m4, m5, m2
+    pmulhrsw m5, m3
+    REPX {pmulhrsw x, m6 }, m4, m5
+%endif
+    paddsw m2, m4
+    paddsw m3, m5
+%if ARCH_X86_64
+    REPX {pmulhrsw x, m11}, m0, m1, m2, m3
+%else
+    psrlw m6, 1 ; pw_8192
+    REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
+    pxor m6, m6
+%endif
+    call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero
+    lea dstq, [dstq+strideq*4]
+    add cq, 16
+    btc eobd, 16
+    jnc .main
+    ret
+
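+; same tiling scheme as the 16x32 version above: the staged eobd checks
+; decide how many additional coefficient tiles .main is called on before
+; bailing out at .ret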
+cglobal inv_txfm_add_identity_identity_32x16_16bpc, 4, 7, 11, dst, stride, c, eob
+%if ARCH_X86_32
+    LEA r6, $$
+%else
+    mova m8, [o(pw_2896x8)]
+    mova m9, [o(pw_1697x16)]
+    mova m10, [o(pw_2048)]
+%endif
+    mova m7, [o(pixel_10bpc_max)]
+    lea r4, [strideq*3]
+    pxor m6, m6
+    mov r5, dstq
+    call .main
+    sub eobd, 36
+    jl .ret
+    call .main
+    add cq, 64*8-64
+    lea dstq, [r5+16*1]
+    call .main
+    sub eobd, 107 ; eob < 143
+    jl .ret
+    call .main
+    add cq, 64*8-64
+    lea dstq, [r5+16*2]
+    call .main
+    sub eobd, 128 ; eob < 271
+    jl .ret
+    call .main
+    add cq, 64*8-64
+    lea dstq, [r5+16*3]
+    call .main
+    sub eobd, 128 ; eob < 399
+    jl .ret
+    call .main
+.ret:
+    RET
+ALIGN function_align
+.main:
+    mova m0, [cq+64*0]
+    packssdw m0, [cq+64*1]
+    mova m1, [cq+64*2]
+    packssdw m1, [cq+64*3]
+    mova m2, [cq+64*4]
+    packssdw m2, [cq+64*5]
+    mova m3, [cq+64*6]
+    packssdw m3, [cq+64*7]
+%if ARCH_X86_64
+    REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
+%else
+    mova m6, [o(pw_2896x8)]
+    REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
+%endif
+    REPX {paddsw x, x }, m0, m1, m2, m3
+%if ARCH_X86_64
+    pmulhrsw m4, m9, m0
+    pmulhrsw m5, m9, m1
+%else
+    mova m6, [o(pw_1697x16)]
+    pmulhrsw m4, m6, m0
+    pmulhrsw m5, m6, m1
+%endif
+    REPX {paddsw x, x }, m0, m1
+    paddsw m0, m4
+    paddsw m1, m5
+%if ARCH_X86_64
+    pmulhrsw m4, m9, m2
+    pmulhrsw m5, m9, m3
+%else
+    pmulhrsw m4, m6, m2
+    pmulhrsw m6, m3
+%endif
+    REPX {paddsw x, x }, m2, m3
+    paddsw m2, m4
+%if ARCH_X86_64
+    paddsw m3, m5
+    REPX {pmulhrsw x, m10}, m0, m1, m2, m3
+%else
+    paddsw m3, m6
+    mova m6, [o(pw_2048)]
+    REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
+    pxor m6, m6
+%endif
+    REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+    call m(inv_txfm_add_identity_identity_8x32_16bpc).main
+    lea dstq, [dstq+strideq*4]
+    add cq, 16
+    btc eobd, 16
+    jnc .main
+    ret
+
+cglobal inv_txfm_add_identity_identity_32x32_16bpc, 4, 7, 8, dst, stride, c, eob
+%undef cmp
+%if ARCH_X86_32
+    LEA r6, $$
+%endif
+    mova m5, [o(pw_8192)]
+    mova m7, [o(pixel_10bpc_max)]
+    pxor m6, m6
+    lea r4, [strideq*3]
+    mov r5, dstq
+    call .main ; 0
+    cmp eobd, 36
+    jl .ret
+    add cq, 128*8-32 ; 0 1
+    lea dstq, [r5+16] ; 1
+    call .main
+    call .main2
+    cmp eobd, 136
+    jl .ret
+    add cq, 128*16-64 ; 0 1 2
+    lea dstq, [r5+16*2] ; 1 2
+    call .main ; 2
+    call .main2
+    call .main2
+    cmp eobd, 300
+    jl .ret
+    add cq, 128*24-96 ; 0 1 2 3
+    add r5, 16*3 ; 1 2 3
+    mov dstq, r5 ; 2 3
+    call .main ; 3
+    call .main2
+    call .main2
+    call .main2
+    cmp eobd, 535
+    jl .ret
+    add cq, 128*24-96 ; 0 1 2 3
+    lea dstq, [r5+strideq*8] ; 1 2 3 4
+    mov r5, dstq ; 2 3 4
+    call .main ; 3 4
+    call .main2
+    call .main2
+    cmp eobd, 755
+    jl .ret
+    add cq, 128*16-64 ; 0 1 2 3
+    lea dstq, [r5+strideq*8] ; 1 2 3 4
+    mov r5, dstq ; 2 3 4 5
+    call .main ; 3 4 5
+    call .main2
+    cmp eobd, 911
+    jl .ret
+    add cq, 128*8-32 ; 0 1 2 3
+    lea dstq, [r5+strideq*8] ; 1 2 3 4
+    call .main ; 2 3 4 5
+.ret: ; 3 4 5 6
+    RET
+ALIGN function_align
+.main2:
+    sub cq, 128*8
+    sub dstq, 16
+.main:
+    mova m0, [cq+128*0]
+    packssdw m0, [cq+128*1]
+    mova m1, [cq+128*2]
+    packssdw m1, [cq+128*3]
+    mova m2, [cq+128*4]
+    packssdw m2, [cq+128*5]
+    mova m3, [cq+128*6]
+    packssdw m3, [cq+128*7]
+    REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+    call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero
+    lea dstq, [dstq+strideq*4]
+    add cq, 16
+    btc eobd, 16
+    jnc .main
+    ret
+
+cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 15, 0-36*16, \
+                                         dst, stride, c, eob
+%if ARCH_X86_32
+    LEA r6, $$
+%define base $$
+    DECLARE_REG_TMP 0, 4
+%else
+    lea r6, [tbl_Nx32_odd_offset]
+%define base tbl_Nx32_odd_offset
+    DECLARE_REG_TMP 4, 7
+%if WIN64
+    mov [rsp+gprsize*1+35*16], r7
+%endif
+%endif
+%define o2(x) r6-base+x
+    test eobd, eobd
+    jz .dconly
+
+%if ARCH_X86_32
+    mov [rsp+gprsize*1+35*16], r0
+%endif
+%undef cmp
+    ; remove entirely-zero iterations
+    mov r5d, 7*2
+    cmp eobw, word [o2(tbl_8x32_2d)+r5]
+    jge .end_zero_loop
+    pxor m0, m0
+.zero_loop:
+    movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+    movzx t1d, t0b
+    shr t0d, 8
+    mova [rsp+ 3*16+r5*8], m0
+    mova [rsp+11*16+r5*8], m0
+    mova [rsp+ 3*16+t0*8], m0
+    mova [rsp+ 3*16+t1*8], m0
+    sub r5d, 2
+    cmp eobw, word [o2(tbl_8x32_2d)+r5]
+    jl .zero_loop
+.end_zero_loop:
+    ; actual first pass after skipping all-zero data
+    mov [rsp+gprsize*0+35*16], eobd
+    mov r3, rsp
+.loop_pass1:
+%if ARCH_X86_64
+    mova m11, [o(pd_2048)]
+    mova m12, [o(clip_18b_min)]
+    mova m13, [o(clip_18b_max)]
+    mova m14, [o(pd_2896)]
+%endif
+    mova m0, [cq+0*128+r5*8]
+    mova m1, [cq+1*128+r5*8]
+    mova m2, [cq+2*128+r5*8]
+    mova m3, [cq+3*128+r5*8]
+    mova m4, [cq+4*128+r5*8]
+    mova m5, [cq+5*128+r5*8]
+    mova m6, [cq+6*128+r5*8]
+    mova m7, [cq+7*128+r5*8]
+    call m(idct_8x4_internal_16bpc).main_pass1
+    mova m1, [o(pd_2)]
+    REPX {paddd x, m1}, m0, m6, m5, m3
+    call m(idct_8x4_internal_16bpc).round
+    REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
+    packssdw m0, m1
+    packssdw m2, m3
+    packssdw m4, m5
+    packssdw m6, m7
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+
+    movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+    movzx t1d, t0b
+    shr t0d, 8
+    mova [r3+ 3*16+r5*8], m0
+    mova [r3+11*16+r5*8], m2
+    mova [r3+ 3*16+t1*8], m1
+    mova [r3+ 3*16+t0*8], m3
+    pxor m7, m7
+    REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+    sub r5d, 2
+    jge .loop_pass1
+
+    ; pass 2 code starts here
+    ; m0 is already loaded from last iteration of first pass
+%if ARCH_X86_32
+    mov r0, [rsp+gprsize*1+35*16]
+%endif
+    mov eobd, [rsp+gprsize*0+35*16]
+    cmp eobd, 43
+    jl .load_veryfast
+    cmp eobd, 107
+    jl .load_fast
+    ; load normal
+    lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
+    jmp .run
+.load_fast:
+    lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+    jmp .run
+.load_veryfast:
+    lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+    ; fall-through
+.run:
+    call .pass2
+%if WIN64
+    mov r7, [rsp+gprsize*1+35*16]
+%endif
+    RET
+
+.pass2:
+%if ARCH_X86_32
+    lea r5, [o(itx8_start)]
+%endif
+    mova m1, [rsp+gprsize+16* 4]
+    mova m2, [rsp+gprsize+16* 5]
+    mova m3, [rsp+gprsize+16* 6]
+    mova m4, [rsp+gprsize+16* 7]
+    mova m5, [rsp+gprsize+16* 8]
+    mova m6, [rsp+gprsize+16* 9]
+    mova m7, [rsp+gprsize+16*10]
+    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+    mova [rsp+gprsize+ 3*16], m0
+    mova [rsp+gprsize+ 4*16], m1
+    mova [rsp+gprsize+ 5*16], m2
+    mova [rsp+gprsize+ 6*16], m3
+    mova [rsp+gprsize+ 7*16], m4
+    mova [rsp+gprsize+ 8*16], m5
+    mova [rsp+gprsize+ 9*16], m6
+    mova m0, [rsp+gprsize+11*16]
+    mova m1, [rsp+gprsize+12*16]
+    mova m2, [rsp+gprsize+13*16]
+    mova m3, [rsp+gprsize+14*16]
+    mova m4, [rsp+gprsize+15*16]
+    mova m5, [rsp+gprsize+16*16]
+    mova m6, [rsp+gprsize+17*16]
+    mova m7, [rsp+gprsize+18*16]
+    call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
+    mova m7, [rsp+gprsize+ 0*16]
+    mova [rsp+gprsize+11*16], m0
+    mova [rsp+gprsize+12*16], m1
+    mova [rsp+gprsize+13*16], m2
+    mova [rsp+gprsize+14*16], m3
+    mova [rsp+gprsize+15*16], m4
+    mova [rsp+gprsize+16*16], m5
+    mova [rsp+gprsize+17*16], m6
+    mova [rsp+gprsize+18*16], m7
+    call r4
+%if ARCH_X86_64
+    mova m8, [o(pw_2048)]
+    pxor m9, m9
+    mova m10, [o(pixel_10bpc_max)]
+%endif
+    lea r3, [strideq*3]
+    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+    lea dstq, [dstq+strideq*8]
+    mova m0, [rsp+gprsize+11*16]
+    mova m1, [rsp+gprsize+12*16]
+    mova m2, [rsp+gprsize+13*16]
+    mova m3, [rsp+gprsize+14*16]
+    mova m4, [rsp+gprsize+15*16]
+    mova m5, [rsp+gprsize+16*16]
+    mova m6, [rsp+gprsize+17*16]
+    mova m7, [rsp+gprsize+18*16]
+    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+    lea dstq, [dstq+strideq*8]
+    mova m0, [rsp+gprsize+19*16]
+    mova m1, [rsp+gprsize+20*16]
+    mova m2, [rsp+gprsize+21*16]
+    mova m3, [rsp+gprsize+22*16]
+    mova m4, [rsp+gprsize+23*16]
+    mova m5, [rsp+gprsize+24*16]
+    mova m6, [rsp+gprsize+25*16]
+    mova m7, [rsp+gprsize+26*16]
+    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+    lea dstq, [dstq+strideq*8]
+    mova m0, [rsp+gprsize+27*16]
+    mova m1, [rsp+gprsize+28*16]
+    mova m2, [rsp+gprsize+29*16]
+    mova m3, [rsp+gprsize+30*16]
+    mova m4, [rsp+gprsize+31*16]
+    mova m5, [rsp+gprsize+32*16]
+    mova m6, [rsp+gprsize+33*16]
+    mova m7, [rsp+gprsize+34*16]
+    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+    ret
+.dconly:
+    imul r5d, [cq], 181
+    mov [cq], eobd ; 0
+    mov r3d, 8
+    add r5d, 640
+    sar r5d, 10
+    add rsp, (31+2*ARCH_X86_64)*16
+    jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end2
+
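+; 16x32 uses the same pass-2 dispatch as 8x32 above: eobd selects the
+; normal, _fast or _veryfast 8-bit idct32 kernel depending on how many
+; input rows can be non-zero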
+cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 16, 0-77*16, \
+                                          dst, stride, c, eob
+    LEA r6, base
+    test eobd, eobd
+    jz .dconly
+
+%if ARCH_X86_32
+    mov [rsp+gprsize*1+76*16], r0
+%elif WIN64
+    mov [rsp+gprsize*1+76*16], r7
+%endif
+%undef cmp
+    ; remove entirely-zero iterations
+    mov r5d, 7*2
+    cmp eobw, word [o2(tbl_16x32_2d)+r5]
+    jge .end_zero_loop
+    pxor m0, m0
+.zero_loop:
+    movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+    movzx t1d, t0b
+    shr t0d, 8
+    mova [rsp+12*16+r5*8], m0
+    mova [rsp+20*16+r5*8], m0
+    mova [rsp+12*16+t0*8], m0
+    mova [rsp+12*16+t1*8], m0
+    mova [rsp+44*16+r5*8], m0
+    mova [rsp+52*16+r5*8], m0
+    mova [rsp+44*16+t0*8], m0
+    mova [rsp+44*16+t1*8], m0
+    sub r5d, 2
+    cmp eobw, word [o2(tbl_16x32_2d)+r5]
+    jl .zero_loop
+.end_zero_loop:
+    ; actual first pass after skipping all-zero data
+    mov [rsp+gprsize*0+76*16], eobd
+    mov r3, rsp
+.loop_pass1:
+%if ARCH_X86_64
+    mova m11, [o(pd_2048)]
+    mova m12, [o(clip_18b_min)]
+    mova m13, [o(clip_18b_max)]
+    mova m14, [o(pd_2896)]
+%endif
+    mova m0, [cq+ 1*128+r5*8]
+    mova m1, [cq+ 3*128+r5*8]
+    mova m2, [cq+ 5*128+r5*8]
+    mova m3, [cq+ 7*128+r5*8]
+    mova m4, [cq+ 9*128+r5*8]
+    mova m5, [cq+11*128+r5*8]
+    mova m6, [cq+13*128+r5*8]
+    mova m7, [cq+15*128+r5*8]
+    call m(idct_8x4_internal_16bpc).rect2_mul
+    call m(idct_16x4_internal_16bpc).main_oddhalf
+
+    mova m0, [cq+ 0*128+r5*8]
+    mova m1, [cq+ 2*128+r5*8]
+    mova m2, [cq+ 4*128+r5*8]
+    mova m3, [cq+ 6*128+r5*8]
+    mova m4, [cq+ 8*128+r5*8]
+    mova m5, [cq+10*128+r5*8]
+    mova m6, [cq+12*128+r5*8]
+    mova m7, [cq+14*128+r5*8]
+    call m(idct_8x4_internal_16bpc).rect2_mul
+    call m(idct_8x4_internal_16bpc).main_pass1
+    call m(idct_8x4_internal_16bpc).round
+    call m(idct_16x4_internal_16bpc).round
+%if ARCH_X86_64
+    packssdw m0, m1
+    packssdw m2, m3
+    packssdw m4, m5
+    packssdw m6, m7
+    packssdw m8, m9
+    packssdw m10, m11
+    packssdw m12, m13
+    packssdw m14, m15
+%endif
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+    movzx t1d, t0b
+    shr t0d, 8
+%if ARCH_X86_64
+    mova [rsp+12*16+r5*8], m0
+    mova [rsp+20*16+r5*8], m2
+    mova [rsp+12*16+t1*8], m1
+    mova [rsp+12*16+t0*8], m3
+    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+    mova [rsp+44*16+r5*8], m8
+    mova [rsp+52*16+r5*8], m10
+    mova [rsp+44*16+t1*8], m9
+    mova [rsp+44*16+t0*8], m11
+%else
+    mova [rsp+44*16+r5*8], m0
+    mova [rsp+52*16+r5*8], m2
+    mova [rsp+44*16+t1*8], m1
+    mova [rsp+44*16+t0*8], m3
+    mova m0, [r3+ 8*16]
+    mova m2, [r3+ 9*16]
+    mova m4, [r3+10*16]
+    mova m6, [r3+11*16]
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova [rsp+12*16+r5*8], m0
+    mova [rsp+20*16+r5*8], m2
+    mova [rsp+12*16+t1*8], m1
+    mova [rsp+12*16+t0*8], m3
+%endif
+    pxor m7, m7
+    REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+    sub r5d, 2
+    jge .loop_pass1
+
+    ; pass=2
+    add rsp, 9*16
+%if ARCH_X86_64
+    mov r6, dstq
+%else
+    mov dstq, [rsp+gprsize*1+67*16]
+%endif
+    mov eobd, [rsp+gprsize*0+67*16]
+    cmp eobd, 44
+    jl .load_veryfast
+    cmp eobd, 151
+    jl .load_fast
+    ; load normal
+    lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
+    jmp .run
+.load_fast:
+    lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+    jmp .run
+.load_veryfast:
+    lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+    ; fall-through
+.run:
+%if ARCH_X86_64
+    lea r2, [dstq+32]
+    mov r7, -4
+%else
+    lea r2, [rsp+67*16]
+    mov dword [r2+0*gprsize], 2
+%endif
+    jmp .loop_pass2_entry
+.loop_pass2:
+    mova m0, [rsp+16* 3]
+.loop_pass2_entry:
+%if ARCH_X86_32
+    mov dstq, [r2+1*gprsize]
+%endif
+    call m(inv_txfm_add_dct_dct_8x32_16bpc).pass2
+    add rsp, 32*16
+%if ARCH_X86_64
+    add r7, 2
+    lea dstq, [r2+r7*8]
+    jl .loop_pass2
+%if WIN64
+    mov r7, [rsp+gprsize*1+3*16]
+%endif
+%else
+    add dword [r2+1*gprsize], 16
+    dec dword [r2+0*gprsize]
+    jg .loop_pass2
+%endif
+%assign stack_size (stack_size-73*16)
+%if STACK_ALIGNMENT >= 16
+%assign stack_size_padded (stack_size_padded-73*16)
+%assign stack_offset (stack_offset-73*16)
+%else
+%xdefine rstkm [rsp + stack_size]
+%endif
+    RET
+.dconly:
+    imul r5d, [cq], 181
+    mov [cq], eobd ; 0
+    mov r3d, 32
+    add r5d, 128
+    sar r5d, 8
+    imul r5d, 181
+    add rsp, (65+4*ARCH_X86_64)*16
+    jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
+
[r3+19*16] + packssdw m4, [r3+21*16] + packssdw m6, [r3+23*16] + call m(idct_8x4_internal_16bpc).transpose4x8packed + mova [cq+32* 8+r5*8], m0 + mova [cq+32* 9+r5*8], m1 + mova [cq+32*10+r5*8], m2 + mova [cq+32*11+r5*8], m3 + mova m0, [r3+31*16] + mova m2, [r3+29*16] + mova m4, [r3+27*16] + mova m6, [r3+25*16] + packssdw m0, [r3+30*16] + packssdw m2, [r3+28*16] + packssdw m4, [r3+26*16] + packssdw m6, [r3+24*16] + call m(idct_8x4_internal_16bpc).transpose4x8packed + mova [cq+32*12+r5*8], m0 + mova [cq+32*13+r5*8], m1 + mova [cq+32*14+r5*8], m2 + mova [cq+32*15+r5*8], m3 + mova m0, [r3+ 0*16] + mova m2, [r3+ 2*16] + mova m4, [r3+ 4*16] + mova m6, [r3+ 6*16] + packssdw m0, [r3+ 1*16] + packssdw m2, [r3+ 3*16] + packssdw m4, [r3+ 5*16] + packssdw m6, [r3+ 7*16] + call m(idct_8x4_internal_16bpc).transpose4x8packed +%endif + pxor m7, m7 + ; clear lower half of [cq] + REPX {mova [cq+x*32+r5*8], m7}, 16, 17, 18, 19, 20, 21, 22, 23, \ + 24, 25, 26, 27, 28, 29, 30, 31 + test r5d, r5d + jz .end_pass1 + mova [cq+32* 0+r5*8], m0 + mova [cq+32* 1+r5*8], m1 + mova [cq+32* 2+r5*8], m2 + mova [cq+32* 3+r5*8], m3 + sub r5d, 2 + jmp .loop_pass1 +.end_pass1: + + ; pass=2, we need to call this otherwise the stack pointer has + ; the wrong offset in the 8-bit code + mov r4d, 4 + call m(idct_16x8_internal_16bpc).pass2_main + RET + +.main_oddhalf_part1_fast: ; lower half zero + pmulld m7, m0, [o(pd_4091)] + pmulld m0, [o(pd_201)] + pmulld m4, m3, [o(pd_m2751)] +%if ARCH_X86_32 + pmulld m3, [o(pd_3035)] + mova m5, [o(pd_2048)] + REPX {paddd x, m5}, m0, m7 + REPX {psrad x, 12}, m0, m7 + mova [r3+3*16], m7 + mova m7, m3 + mova m3, m5 +%else + pmulld m3, [o(pd_3035)] +%endif + pmulld m6, m1, [o(pd_m1380)] + pmulld m1, [o(pd_3857)] + pmulld m5, m2, [o(pd_3703)] + pmulld m2, [o(pd_1751)] + jmp .main_oddhalf_part1_fast2 +.main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31 +%if ARCH_X86_64 + ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 201, 4091 ; t16a, t31a + ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a + ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a + ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a +.main_oddhalf_part1_fast2: + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 + psubd m8, m0, m4 ; t17 + paddd m0, m4 ; t16 + psubd m4, m6, m2 ; t18 + paddd m6, m2 ; t19 + psubd m2, m1, m5 ; t29 + paddd m1, m5 ; t28 + psubd m5, m7, m3 ; t30 + paddd m7, m3 ; t31 + REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 + REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 + mova m15, [o(pd_4017)] + mova m10, [o(pd_799)] + ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a + ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 4 ; t29a, t18a + psubd m3, m0, m6 ; t19a + paddd m0, m6 ; t16a + psubd m6, m7, m1 ; t28a + paddd m7, m1 ; t31a + psubd m1, m5, m4 ; t18 + paddd m5, m4 ; t17 + psubd m4, m8, m2 ; t29 + paddd m8, m2 ; t30 + REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8 + REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 + mova m15, [o(pd_3784)] + mova m10, [o(pd_1567)] + ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a + ITX_MULSUB_2D 6, 3, 2, 9, _, 11, 10, 15 ; t19, t28 + mova [r3+16*0], m0 + mova [r3+16*1], m5 + mova [r3+16*2], m4 + mova [r3+16*3], m6 + mova [r3+16*4], m3 + mova [r3+16*5], m1 + mova [r3+16*6], m8 + mova [r3+16*7], m7 +%else + mova [r3+0*16], m2 + mova [r3+1*16], m3 + mova [r3+2*16], m4 + mova [r3+3*16], m5 + mova m3, [o(pd_2048)] + ITX_MULSUB_2D 0, 7, 2, 4, 5, 3, 201, 4091 ; t16a, t31a + ITX_MULSUB_2D 6, 1, 2, 
4, 5, _, 3857, 1380 ; t19a, t28a + mova m4, [r3+2*16] + mova m5, [r3+3*16] + mova [r3+2*16], m6 + mova [r3+3*16], m7 + mova m2, [r3+0*16] + mova m7, [r3+1*16] + mova [r3+0*16], m0 + mova [r3+1*16], m1 + ITX_MULSUB_2D 2, 5, 0, 1, 6, _, 1751, 3703 ; t18a, t29a + ITX_MULSUB_2D 4, 7, 0, 1, 6, _, 3035, 2751 ; t17a, t30a + mova m0, [r3+0*16] + mova m1, [r3+1*16] + mova m6, [r3+2*16] +.main_oddhalf_part1_fast2: + REPX {paddd x, m3}, m1, m2, m4, m5, m6, m7 + REPX {psrad x, 12}, m1, m2, m4, m5, m6, m7 + psubd m3, m0, m4 ; t17 + mova [r3+0*16], m3 + mova m3, [r3+3*16] + paddd m0, m4 ; t16 + psubd m4, m6, m2 ; t18 + paddd m6, m2 ; t19 + psubd m2, m1, m5 ; t29 + paddd m1, m5 ; t28 + psubd m5, m3, m7 ; t30 + paddd m7, m3 ; t31 + mova m3, [o(clip_18b_min)] + REPX {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7 + pmaxsd m3, [r3+0*16] + mova [r3+0*16], m3 + mova m3, [o(clip_18b_max)] + REPX {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7 + pminsd m3, [r3+0*16] + mova [r3+0*16], m0 + mova [r3+1*16], m1 + mova [r3+2*16], m6 + mova [r3+3*16], m7 + mova m0, [o(pd_2048)] + ITX_MULSUB_2D 5, 3, 1, 6, 7, 0, 799, 4017 ; t17a, t30a + ITX_MULSUB_2D 2, 4, 1, 6, _, 0, 7, 4017, 4 ; t29a, t18a + psubd m1, m5, m4 ; t18 + paddd m5, m4 ; t17 + psubd m4, m3, m2 ; t29 + paddd m3, m2 ; t30 + mova m0, [r3+0*16] + mova m2, [r3+1*16] + mova m6, [r3+2*16] + mova m7, [r3+3*16] + mova [r3+0*16], m3 + psubd m3, m0, m6 ; t19a + paddd m0, m6 ; t16a + psubd m6, m7, m2 ; t28a + paddd m7, m2 ; t31a + mova m2, [o(clip_18b_min)] + REPX {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5 + pmaxsd m2, [r3+0*16] + mova [r3+0*16], m2 + mova m2, [o(clip_18b_max)] + REPX {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5 + pminsd m2, [r3+0*16] + mova [r3+16*0], m0 + mova [r3+16*1], m5 + mova [r3+16*6], m2 + mova [r3+16*7], m7 + mova m7, [o(pd_2048)] + ITX_MULSUB_2D 4, 1, 0, 5, 2, 7, 1567, 3784 ; t18a, t29a + ITX_MULSUB_2D 6, 3, 0, 5, 2, 7, 2, 3784 ; t19, t28 + mova [r3+16*2], m4 + mova [r3+16*3], m6 + mova [r3+16*4], m3 + mova [r3+16*5], m1 +%endif + ret +.main_oddhalf_part2_fast: ; lower half zero + pmulld m7, m0, [o(pd_m601)] + pmulld m0, [o(pd_4052)] + pmulld m4, m3, [o(pd_3290)] +%if ARCH_X86_32 + pmulld m3, [o(pd_2440)] + mova m5, [o(pd_2048)] + REPX {paddd x, m5}, m0, m7 + REPX {psrad x, 12}, m0, m7 + mova [r3+11*16], m7 + mova m7, m3 + mova m3, m5 +%else + pmulld m3, [o(pd_2440)] +%endif + pmulld m6, m1, [o(pd_3973)] + pmulld m1, [o(pd_995)] + pmulld m5, m2, [o(pd_m2106)] + pmulld m2, [o(pd_3513)] + jmp .main_oddhalf_part2_fast2 +.main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29 +%if ARCH_X86_64 + ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 4052, 601 ; t23a, t24a + ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 995, 3973 ; t20a, t27a + ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a + ITX_MULSUB_2D 3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a +.main_oddhalf_part2_fast2: + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 + psubd m8, m0, m4 ; t25 + paddd m0, m4 ; t24 + psubd m4, m6, m2 ; t26 + paddd m6, m2 ; t27 + psubd m2, m1, m5 ; t21 + paddd m1, m5 ; t20 + psubd m5, m7, m3 ; t22 + paddd m7, m3 ; t23 + REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 + REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 + mova m15, [o(pd_2276)] + mova m10, [o(pd_3406)] + ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a + ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 4 ; t25a, t22a + psubd m3, m0, m6 ; t27a + paddd m0, m6 ; t24a + psubd m6, m7, m1 ; t20a + paddd m7, m1 ; t23a + psubd m1, m5, m4 ; t21 + paddd m5, m4 ; t22 + 
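+    ; one more sum/difference pair (t25/t26) completes this butterfly
+    ; stage before the results are clamped to the 18-bit intermediate range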
psubd m4, m8, m2 ; t26 + paddd m8, m2 ; t25 + REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8 + REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 + mova m15, [o(pd_3784)] + mova m10, [o(pd_1567)] + ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 4 ; t26a, t21a + ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 4 ; t27, t20 + mova m9, [r3+16*0] ; t16a + mova m10, [r3+16*1] ; t17 + psubd m2, m9, m7 ; t23 + paddd m9, m7 ; t16 + psubd m7, m10, m5 ; t22a + paddd m10, m5 ; t17a + REPX {pmaxsd x, m12}, m9, m10, m2, m7 + REPX {pminsd x, m13}, m9, m10, m2, m7 + mova [r3+16*0], m9 + mova [r3+16*1], m10 + mova m9, [r3+16*2] ; t18a + mova m10, [r3+16*3] ; t19 + psubd m5, m9, m1 ; t21 + paddd m9, m1 ; t18 + psubd m1, m10, m6 ; t20a + paddd m10, m6 ; t19a + REPX {pmaxsd x, m12}, m9, m10, m5, m1 + REPX {pminsd x, m13}, m9, m10, m5, m1 + mova [r3+16*2], m9 + mova [r3+16*3], m10 + mova m9, [r3+16*4] ; t28 + mova m10, [r3+16*5] ; t29a + psubd m6, m9, m3 ; t27a + paddd m9, m3 ; t28a + psubd m3, m10, m4 ; t26 + paddd m10, m4 ; t29 + REPX {pmaxsd x, m12}, m9, m10, m6, m3 + REPX {pminsd x, m13}, m9, m10, m6, m3 + REPX {pmulld x, m14}, m6, m3, m1, m5 + paddd m6, m11 + paddd m3, m11 + psubd m4, m6, m1 ; t20 + paddd m6, m1 ; t27 + psubd m1, m3, m5 ; t21a + paddd m3, m5 ; t26a + REPX {psrad x, 12 }, m4, m1, m3, m6 + mova [r3+16*4], m4 + mova [r3+16*5], m1 + mova m4, [r3+16*6] ; t30 + mova m1, [r3+16*7] ; t31a + psubd m5, m4, m8 ; t25a + paddd m4, m8 ; t30a + psubd m8, m1, m0 ; t24 + paddd m1, m0 ; t31 + REPX {pmaxsd x, m12}, m8, m5, m4, m1 + REPX {pminsd x, m13}, m8, m5, m4, m1 + REPX {pmulld x, m14}, m5, m8, m7, m2 + paddd m5, m11 + paddd m8, m11 + psubd m0, m5, m7 ; t22 + paddd m5, m7 ; t25 + psubd m7, m8, m2 ; t23a + paddd m2, m8 ; t24a + REPX {psrad x, 12 }, m0, m7, m2, m5 + mova [r3+16*6], m0 + mova [r3+16*7], m7 + mova [r3+16*8], m2 + mova [r3+16*9], m5 + mova [r3+16*10], m3 + mova [r3+16*11], m6 + mova [r3+16*12], m9 + mova [r3+16*13], m10 + mova [r3+16*14], m4 + mova [r3+16*15], m1 +%else + mova [r3+ 8*16], m2 + mova [r3+ 9*16], m3 + mova [r3+10*16], m4 + mova [r3+11*16], m5 + mova m3, [o(pd_2048)] + ITX_MULSUB_2D 7, 0, 2, 4, 5, 3, 4052, 601 ; t23a, t24a + ITX_MULSUB_2D 1, 6, 2, 4, 5, _, 995, 3973 ; t20a, t27a + mova m2, [r3+ 8*16] + mova m4, [r3+10*16] + mova m5, [r3+11*16] + mova [r3+ 8*16], m0 + mova [r3+10*16], m6 + mova [r3+11*16], m7 + mova m7, [r3+ 9*16] + mova [r3+ 9*16], m1 + ITX_MULSUB_2D 5, 2, 0, 6, 1, _, 3513, 2106 ; t21a, t26a + ITX_MULSUB_2D 7, 4, 0, 6, 1, _, 2440, 3290 ; t22a, t25a + mova m0, [r3+ 8*16] + mova m1, [r3+ 9*16] + mova m6, [r3+10*16] +.main_oddhalf_part2_fast2: + REPX {paddd x, m3}, m1, m2, m7, m4, m5, m6 + REPX {psrad x, 12}, m1, m2, m7, m4, m5, m6 + psubd m3, m0, m4 ; t25 + mova [r3+ 8*16], m3 + mova m3, [r3+11*16] + paddd m0, m4 ; t24 + psubd m4, m6, m2 ; t26 + paddd m6, m2 ; t27 + psubd m2, m1, m5 ; t21 + paddd m1, m5 ; t20 + psubd m5, m3, m7 ; t22 + paddd m7, m3 ; t23 + mova m3, [o(clip_18b_min)] + REPX {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7 + pmaxsd m3, [r3+ 8*16] + mova [r3+ 8*16], m3 + mova m3, [o(clip_18b_max)] + REPX {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7 + pminsd m3, [r3+ 8*16] + mova [r3+ 8*16], m0 + mova [r3+ 9*16], m1 + mova [r3+10*16], m6 + mova [r3+11*16], m7 + mova m7, [o(pd_2048)] + ITX_MULSUB_2D 4, 2, 0, 1, 6, 7, 3406, 2276 ; t21a, t26a + ITX_MULSUB_2D 3, 5, 0, 1, _, 7, 6, 2276, 4 ; t25a, t22a + psubd m1, m5, m4 ; t21 + paddd m5, m4 ; t22 + psubd m4, m3, m2 ; t26 + paddd m3, m2 ; t25 + mova m0, [r3+ 8*16] + mova m2, [r3+ 9*16] + mova m6, [r3+10*16] + mova m7, 
[r3+11*16] + mova [r3+ 8*16], m3 + psubd m3, m0, m6 ; t27a + paddd m0, m6 ; t24a + psubd m6, m7, m2 ; t20a + paddd m7, m2 ; t23a + mova m2, [o(clip_18b_min)] + REPX {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5 + pmaxsd m2, [r3+ 8*16] + mova [r3+ 8*16], m2 + mova m2, [o(clip_18b_max)] + REPX {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5 + pminsd m2, [r3+ 8*16] + mova [r3+ 8*16], m0 + mova [r3+ 9*16], m2 + mova [r3+14*16], m5 + mova [r3+15*16], m7 + mova m0, [o(pd_2048)] + ITX_MULSUB_2D 4, 1, 2, 5, 7, 0, 1567, 3784, 4 ; t26a, t21a + ITX_MULSUB_2D 3, 6, 2, 5, _, 0, 7, 3784, 4 ; t27, t20 + mova [r3+10*16], m3 + mova m0, [o(clip_18b_min)] + mova m2, [o(clip_18b_max)] + mova m5, [r3+16*2] ; t18a + mova m7, [r3+16*3] ; t19 + psubd m3, m5, m1 ; t21 + paddd m5, m1 ; t18 + psubd m1, m7, m6 ; t20a + paddd m7, m6 ; t19a + REPX {pmaxsd x, m0}, m5, m7, m3, m1 + REPX {pminsd x, m2}, m5, m7, m3, m1 + mova [r3+16*2], m5 + mova [r3+16*3], m7 + mova [r3+11*16], m3 + mova m3, [r3+10*16] + mova m5, [r3+16*4] ; t28 + mova m7, [r3+16*5] ; t29a + psubd m6, m5, m3 ; t27a + paddd m5, m3 ; t28a + psubd m3, m7, m4 ; t26 + paddd m7, m4 ; t29 + REPX {pmaxsd x, m0}, m5, m7, m6, m3 + REPX {pminsd x, m2}, m5, m7, m6, m3 + mova [r3+16*12], m5 + mova [r3+16*13], m7 + mova m5, [o(pd_2048)] + mova m7, [o(pd_2896)] + mova m4, [r3+11*16] + REPX {pmulld x, m7}, m6, m3, m1, m4 + paddd m6, m5 + paddd m3, m5 + psubd m5, m6, m1 ; t20 + paddd m6, m1 ; t27 + psubd m1, m3, m4 ; t21a + paddd m3, m4 ; t26a + REPX {psrad x, 12}, m5, m1, m3, m6 + mova [r3+16*4], m5 + mova [r3+16*5], m1 + mova [r3+16*10], m3 + mova [r3+16*11], m6 + + mova m5, [r3+14*16] + mova m6, [r3+15*16] + mova m3, [r3+16*0] ; t16a + mova m4, [r3+16*1] ; t17 + psubd m1, m3, m6 ; t23 + paddd m3, m6 ; t16 + psubd m6, m4, m5 ; t22a + paddd m4, m5 ; t17a + REPX {pmaxsd x, m0}, m3, m4, m1, m6 + REPX {pminsd x, m2}, m3, m4, m1, m6 + mova [r3+16*0], m3 + mova [r3+16*1], m4 + mova m5, [r3+ 8*16] + mova m3, [r3+ 9*16] + mova [r3+ 8*16], m1 + mova [r3+ 9*16], m6 + mova m4, [r3+16*6] ; t30 + mova m1, [r3+16*7] ; t31a + psubd m6, m1, m5 ; t24 + paddd m1, m5 ; t31 + psubd m5, m4, m3 ; t25a + paddd m4, m3 ; t30a + REPX {pmaxsd x, m0}, m6, m5, m4, m1 + REPX {pminsd x, m2}, m6, m5, m4, m1 + mova [r3+16*14], m4 + mova [r3+16*15], m1 + mova m4, [o(pd_2048)] + mova m1, [r3+ 9*16] + mova m2, [r3+ 8*16] + REPX {pmulld x, m7}, m5, m6, m1, m2 + paddd m5, m4 + paddd m6, m4 + psubd m0, m5, m1 ; t22 + paddd m5, m1 ; t25 + psubd m1, m6, m2 ; t23a + paddd m2, m6 ; t24a + REPX {psrad x, 12}, m0, m1, m2, m5 + mova [r3+16*6], m0 + mova [r3+16*7], m1 + mova [r3+16*8], m2 + mova [r3+16*9], m5 +%endif + ret + + ; final sumsub for idct16 as well as idct32, plus final downshift +%macro IDCT32_END 6 ; in/out1, out2-4, tmp, shift, idx + mova m%4, [r3+16*(23-%1)] + pmaxsd m%1, m12 + pminsd m%1, m13 + psubd m%3, m%1, m%4 ; idct16 out15 - n + paddd m%1, m%4 ; idct16 out0 + n + pmaxsd m%1, m12 + pmaxsd m%3, m12 + pminsd m%1, m13 + pminsd m%3, m13 + paddd m%1, m11 + paddd m%3, m11 + mova m%5, [r3+16*( 0+%1)] + mova m%2, [r3+16*(15-%1)] + psubd m%4, m%1, m%2 ; out31 - n + paddd m%1, m%2 ; out0 + n + paddd m%2, m%3, m%5 ; out15 - n + psubd m%3, m%5 ; out16 + n + REPX {psrad x, %6}, m%1, m%3, m%2, m%4 +%endmacro + +.round_dct32: +%if ARCH_X86_64 + psrld m11, 10 ; pd_2 + IDCT32_END 0, 15, 8, 9, 10, 2 ; 0 15 16 31 + mova [r3+ 0*16], m6 + mova [r3+23*16], m7 + IDCT32_END 1, 14, 6, 7, 10, 2 ; 1 14 17 30 + packssdw m0, m1 ; 0 1 + packssdw m14, m15 ; 14 15 + packssdw m8, m6 ; 16 17 + packssdw m7, m9 ; 30 31 + mova [r3+16*15], 
m14 + mova [r3+16*14], m7 + IDCT32_END 2, 15, 10, 7, 6, 2 ; 2 13 18 29 + IDCT32_END 3, 14, 1, 9, 6, 2 ; 3 12 19 28 + packssdw m2, m3 ; 2 3 + packssdw m14, m15 ; 12 13 + packssdw m10, m1 ; 18 19 + packssdw m9, m7 ; 28 29 + mova [r3+16*13], m14 + mova [r3+16*12], m9 + IDCT32_END 4, 15, 1, 7, 6, 2 ; 4 11 20 27 + IDCT32_END 5, 14, 3, 9, 6, 2 ; 5 10 21 26 + packssdw m4, m5 ; 4 5 + packssdw m14, m15 ; 10 11 + packssdw m1, m3 ; 20 21 + packssdw m9, m7 ; 26 27 + mova [r3+16*11], m14 + mova [r3+16*10], m9 + mova m6, [r3+ 0*16] + mova m7, [r3+23*16] + IDCT32_END 6, 15, 14, 5, 3, 2 ; 6 9 22 25 + IDCT32_END 7, 11, 3, 9, 13, 2 ; 7 8 23 24 + packssdw m6, m7 ; 6 7 + packssdw m11, m15 ; 8 9 + packssdw m14, m3 ; 22 23 + packssdw m9, m5 ; 24 25 + mova [r3+16*9], m11 + mova [r3+16*8], m9 + mova m12, m1 + ret +%else + mova [r3+16*16], m0 + mova [r3+17*16], m1 + mova [r3+18*16], m2 + mova [r3+19*16], m3 + mova [r3+20*16], m4 + mova [r3+21*16], m5 + mova [r3+22*16], m6 + mova [r3+23*16], m7 + mova m1, [o(pd_2)] + mova m2, [o(clip_18b_min)] + mova m3, [o(clip_18b_max)] + + mov r4, 15*16 +.loop_dct32_end: + mova m0, [r3+16*16] + mova m6, [r3+16*24] + pmaxsd m0, m2 + pminsd m0, m3 + psubd m5, m0, m6 ; idct16 out15 - n + paddd m0, m6 ; idct16 out0 + n + pmaxsd m0, m2 + pmaxsd m5, m2 + pminsd m0, m3 + pminsd m5, m3 + paddd m0, m1 + paddd m5, m1 + mova m7, [r3] + mova m4, [r3+r4] + psubd m6, m0, m4 ; out31 - n + paddd m0, m4 ; out0 + n + paddd m4, m5, m7 ; out15 - n + psubd m5, m7 ; out16 + n + REPX {psrad x, 2}, m0, m5, m4, m6 + mova [r3], m0 + mova [r3+r4], m4 + mova [r3+16*16], m5 + mova [r3+24*16], m6 + add r3, 16 + sub r4, 32 + jg .loop_dct32_end + ret +%endif + +.dconly: + imul r5d, [cq], 181 + mov [cq], eobd ; 0 + mov r3d, 8 +.dconly1: + add r5d, 640 + sar r5d, 10 +.dconly2: + imul r5d, 2896 + add r5d, 34816 + movd m0, r5d + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 + mova m6, [o(pixel_10bpc_max)] + pxor m5, m5 +.dconly_loop: + mova m1, [dstq+16*0] + mova m2, [dstq+16*1] + mova m3, [dstq+16*2] + mova m4, [dstq+16*3] + REPX {paddw x, m0}, m1, m2, m3, m4 + REPX {pminsw x, m6}, m1, m2, m3, m4 + REPX {pmaxsw x, m5}, m1, m2, m3, m4 + mova [dstq+16*0], m1 + mova [dstq+16*1], m2 + mova [dstq+16*2], m3 + mova [dstq+16*3], m4 + add dstq, strideq + dec r3d + jg .dconly_loop + RET + +cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \ + dst, stride, c, eob + LEA r6, base + test eobd, eobd + jz .dconly + + ; remove entirely-zero iterations +%undef cmp + mov r5d, 8 +.zero_loop: + sub r5d, 2 + cmp eobw, word [o2(tbl_32x16_2d)+r5] + jl .zero_loop + + ; actual first pass after skipping all-zero data +.loop_pass1: +%if ARCH_X86_64 + mova m11, [o(pd_2048)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] + mova m14, [o(pd_2896)] +%endif + mova m0, [cq+64* 1+r5*8] + mova m1, [cq+64* 7+r5*8] + mova m2, [cq+64* 9+r5*8] + mova m3, [cq+64*15+r5*8] + mova m4, [cq+64*17+r5*8] + mova m5, [cq+64*23+r5*8] + mova m6, [cq+64*25+r5*8] + mova m7, [cq+64*31+r5*8] + mov r3, rsp + call m(idct_8x4_internal_16bpc).rect2_mul + call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1 + + mova m0, [cq+64* 3+r5*8] + mova m1, [cq+64* 5+r5*8] + mova m2, [cq+64*11+r5*8] + mova m3, [cq+64*13+r5*8] + mova m4, [cq+64*19+r5*8] + mova m5, [cq+64*21+r5*8] + mova m6, [cq+64*27+r5*8] + mova m7, [cq+64*29+r5*8] +%if ARCH_X86_32 + add r3, 16*8 +%endif + call m(idct_8x4_internal_16bpc).rect2_mul +%if ARCH_X86_32 + sub r3, 16*8 +%endif + call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2 + add r3, 
16*(16+4*ARCH_X86_32) + + mova m0, [cq+64* 2+r5*8] + mova m1, [cq+64* 6+r5*8] + mova m2, [cq+64*10+r5*8] + mova m3, [cq+64*14+r5*8] + mova m4, [cq+64*18+r5*8] + mova m5, [cq+64*22+r5*8] + mova m6, [cq+64*26+r5*8] + mova m7, [cq+64*30+r5*8] + call m(idct_8x4_internal_16bpc).rect2_mul + call m(idct_16x4_internal_16bpc).main_oddhalf + + mova m0, [cq+64* 0+r5*8] + mova m1, [cq+64* 4+r5*8] + mova m2, [cq+64* 8+r5*8] + mova m3, [cq+64*12+r5*8] + mova m4, [cq+64*16+r5*8] + mova m5, [cq+64*20+r5*8] + mova m6, [cq+64*24+r5*8] + mova m7, [cq+64*28+r5*8] + call m(idct_8x4_internal_16bpc).rect2_mul + call m(idct_8x4_internal_16bpc).main_pass1 + call m(idct_8x4_internal_16bpc).round + sub r3, 16*(16+4*ARCH_X86_32) + call .round_dct32 + +%if ARCH_X86_64 + call m(idct_8x4_internal_16bpc).transpose4x8packed + call m(idct_16x4_internal_16bpc).transpose4x8packed_hi + mova [cq+64* 8+r5*8], m8 + mova [cq+64* 9+r5*8], m9 + mova [cq+64*10+r5*8], m10 + mova [cq+64*11+r5*8], m11 + mova m8, [r3+16* 9] ; 8 9 + mova m10, [r3+16*11] ; 10 11 + mova m12, [r3+16*13] ; 12 13 + mova m14, [r3+16*15] ; 14 15 + call m(idct_16x4_internal_16bpc).transpose4x8packed_hi + mova [cq+64* 4+r5*8], m8 + mova [cq+64* 5+r5*8], m9 + mova [cq+64* 6+r5*8], m10 + mova [cq+64* 7+r5*8], m11 + mova m8, [r3+16* 8] ; 24 25 + mova m10, [r3+16*10] ; 26 27 + mova m12, [r3+16*12] ; 28 29 + mova m14, [r3+16*14] ; 30 31 + call m(idct_16x4_internal_16bpc).transpose4x8packed_hi + mova [cq+64*12+r5*8], m8 + mova [cq+64*13+r5*8], m9 + mova [cq+64*14+r5*8], m10 + mova [cq+64*15+r5*8], m11 +%else + sub r3, 8*16 + mova m0, [r3+ 8*16] + mova m2, [r3+10*16] + mova m4, [r3+12*16] + mova m6, [r3+14*16] + packssdw m0, [r3+ 9*16] + packssdw m2, [r3+11*16] + packssdw m4, [r3+13*16] + packssdw m6, [r3+15*16] + call m(idct_8x4_internal_16bpc).transpose4x8packed + mova [cq+64* 4+r5*8], m0 + mova [cq+64* 5+r5*8], m1 + mova [cq+64* 6+r5*8], m2 + mova [cq+64* 7+r5*8], m3 + mova m0, [r3+16*16] + mova m2, [r3+18*16] + mova m4, [r3+20*16] + mova m6, [r3+22*16] + packssdw m0, [r3+17*16] + packssdw m2, [r3+19*16] + packssdw m4, [r3+21*16] + packssdw m6, [r3+23*16] + call m(idct_8x4_internal_16bpc).transpose4x8packed + mova [cq+64* 8+r5*8], m0 + mova [cq+64* 9+r5*8], m1 + mova [cq+64*10+r5*8], m2 + mova [cq+64*11+r5*8], m3 + mova m0, [r3+31*16] + mova m2, [r3+29*16] + mova m4, [r3+27*16] + mova m6, [r3+25*16] + packssdw m0, [r3+30*16] + packssdw m2, [r3+28*16] + packssdw m4, [r3+26*16] + packssdw m6, [r3+24*16] + call m(idct_8x4_internal_16bpc).transpose4x8packed + mova [cq+64*12+r5*8], m0 + mova [cq+64*13+r5*8], m1 + mova [cq+64*14+r5*8], m2 + mova [cq+64*15+r5*8], m3 + mova m0, [r3+ 0*16] + mova m2, [r3+ 2*16] + mova m4, [r3+ 4*16] + mova m6, [r3+ 6*16] + packssdw m0, [r3+ 1*16] + packssdw m2, [r3+ 3*16] + packssdw m4, [r3+ 5*16] + packssdw m6, [r3+ 7*16] + call m(idct_8x4_internal_16bpc).transpose4x8packed +%endif + mova [cq+64* 0+r5*8], m0 + mova [cq+64* 1+r5*8], m1 + mova [cq+64* 2+r5*8], m2 + mova [cq+64* 3+r5*8], m3 + pxor m0, m0 + REPX {mova [cq+x*64+r5*8], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \ + 24, 25, 26, 27, 28, 29, 30, 31 + sub r5d, 2 + jge .loop_pass1 + + ; pass=2, we need to call this otherwise the stack pointer has + ; the wrong offset in the 8-bit code + call .pass2 + RET + +.pass2: +%if ARCH_X86_64 + mova m8, [o(pw_2048)] + pxor m9, m9 + mova m10, [o(pixel_10bpc_max)] +%if WIN64 + mov [rsp+16*16+gprsize], r7 +%endif + mov r7, dstq +%else + mov [rsp+2*gprsize+16*16], dstq +%endif + lea r3, [strideq*3] + mov r4d, 4 + jmp m(idct_16x16_internal_16bpc).loop_pass2 
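+    ; same as m(inv_txfm_add_dct_dct_32x8_16bpc).round_dct32, except that
+    ; the first pass of the rectangular 32x16 transform rounds with pd_1
+    ; and downshifts by 1 instead of pd_2 and 2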
+ +.round_dct32: +%if ARCH_X86_64 + psrld m11, 11 ; pd_1 + IDCT32_END 0, 15, 8, 9, 10, 1 ; 0 15 16 31 + mova [r3+ 0*16], m6 + mova [r3+23*16], m7 + IDCT32_END 1, 14, 6, 7, 10, 1 ; 1 14 17 30 + packssdw m0, m1 ; 0 1 + packssdw m14, m15 ; 14 15 + packssdw m8, m6 ; 16 17 + packssdw m7, m9 ; 30 31 + mova [r3+16*15], m14 + mova [r3+16*14], m7 + IDCT32_END 2, 15, 10, 7, 6, 1 ; 2 13 18 29 + IDCT32_END 3, 14, 1, 9, 6, 1 ; 3 12 19 28 + packssdw m2, m3 ; 2 3 + packssdw m14, m15 ; 12 13 + packssdw m10, m1 ; 18 19 + packssdw m9, m7 ; 28 29 + mova [r3+16*13], m14 + mova [r3+16*12], m9 + IDCT32_END 4, 15, 1, 7, 6, 1 ; 4 11 20 27 + IDCT32_END 5, 14, 3, 9, 6, 1 ; 5 10 21 26 + packssdw m4, m5 ; 4 5 + packssdw m14, m15 ; 10 11 + packssdw m1, m3 ; 20 21 + packssdw m9, m7 ; 26 27 + mova [r3+16*11], m14 + mova [r3+16*10], m9 + mova m6, [r3+ 0*16] + mova m7, [r3+23*16] + IDCT32_END 6, 15, 14, 5, 3, 1 ; 6 9 22 25 + IDCT32_END 7, 11, 3, 9, 13, 1 ; 7 8 23 24 + packssdw m6, m7 ; 6 7 + packssdw m11, m15 ; 8 9 + packssdw m14, m3 ; 22 23 + packssdw m9, m5 ; 24 25 + mova [r3+16*9], m11 + mova [r3+16*8], m9 + mova m12, m1 + ret +%else + mova [r3+16*16], m0 + mova [r3+17*16], m1 + mova [r3+18*16], m2 + mova [r3+19*16], m3 + mova [r3+20*16], m4 + mova [r3+21*16], m5 + mova [r3+22*16], m6 + mova [r3+23*16], m7 + pcmpeqd m1, m1 ; -1 + mova m2, [o(clip_18b_min)] + mova m3, [o(clip_18b_max)] + + mov r4, 15*16 +.loop_dct32_end: + mova m0, [r3+16*16] + mova m6, [r3+16*24] + psubd m5, m0, m6 ; idct16 out15 - n + paddd m0, m6 ; idct16 out0 + n + pmaxsd m0, m2 + pmaxsd m5, m2 + pminsd m0, m3 + pminsd m5, m3 + psubd m0, m1 + psubd m5, m1 + mova m7, [r3] + mova m4, [r3+r4] + psubd m6, m0, m4 ; out31 - n + paddd m0, m4 ; out0 + n + paddd m4, m5, m7 ; out15 - n + psubd m5, m7 ; out16 + n + REPX {psrad x, 1}, m0, m5, m4, m6 + mova [r3], m0 + mova [r3+r4], m4 + mova [r3+16*16], m5 + mova [r3+24*16], m6 + add r3, 16 + sub r4, 32 + jg .loop_dct32_end + ret +%endif + +.dconly: + imul r5d, [cq], 181 + mov [cq], eobd ; 0 + mov r3d, 16 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 + add r5d, 384 + sar r5d, 9 + jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2 + +cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \ + dst, stride, c, eob + LEA r6, base + test eobd, eobd + jz .dconly + + ; remove entirely-zero iterations +%if ARCH_X86_32 + mov [rsp+5*32*16+1*gprsize], dstq +%elif WIN64 + mov [rsp+5*32*16+1*gprsize], r7 +%endif +%undef cmp + mov r5d, 14 + cmp eobw, word [o2(tbl_32x32_2d)+r5] + jge .end_zero_loop + pxor m0, m0 +.zero_loop: + movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] + movzx t1d, t0b + shr t0d, 8 + mova [rsp+32*16+r5*8+0*32*16], m0 + mova [rsp+40*16+r5*8+0*32*16], m0 + mova [rsp+32*16+t0*8+0*32*16], m0 + mova [rsp+32*16+t1*8+0*32*16], m0 + mova [rsp+32*16+r5*8+1*32*16], m0 + mova [rsp+40*16+r5*8+1*32*16], m0 + mova [rsp+32*16+t0*8+1*32*16], m0 + mova [rsp+32*16+t1*8+1*32*16], m0 + mova [rsp+32*16+r5*8+2*32*16], m0 + mova [rsp+40*16+r5*8+2*32*16], m0 + mova [rsp+32*16+t0*8+2*32*16], m0 + mova [rsp+32*16+t1*8+2*32*16], m0 + mova [rsp+32*16+r5*8+3*32*16], m0 + mova [rsp+40*16+r5*8+3*32*16], m0 + mova [rsp+32*16+t0*8+3*32*16], m0 + mova [rsp+32*16+t1*8+3*32*16], m0 + sub r5d, 2 + cmp eobw, word [o2(tbl_32x32_2d)+r5] + jl .zero_loop +.end_zero_loop: + + ; actual first pass after skipping all-zero data + mov [rsp+gprsize*0+5*32*16], eobd +.loop_pass1: + mova m0, [cq+128* 1+r5*8] + mova m1, [cq+128* 7+r5*8] + mova m2, [cq+128* 9+r5*8] + mova m3, [cq+128*15+r5*8] + mova m4, [cq+128*17+r5*8] + mova m5, [cq+128*23+r5*8] + mova 
m6, [cq+128*25+r5*8] + mova m7, [cq+128*31+r5*8] +%if ARCH_X86_64 + mova m11, [o(pd_2048)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] + mova m14, [o(pd_2896)] +%endif + mov r3, rsp + call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1 + mova m0, [cq+128* 3+r5*8] + mova m1, [cq+128* 5+r5*8] + mova m2, [cq+128*11+r5*8] + mova m3, [cq+128*13+r5*8] + mova m4, [cq+128*19+r5*8] + mova m5, [cq+128*21+r5*8] + mova m6, [cq+128*27+r5*8] + mova m7, [cq+128*29+r5*8] + call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2 + mova m0, [cq+128* 2+r5*8] + mova m1, [cq+128* 6+r5*8] + mova m2, [cq+128*10+r5*8] + mova m3, [cq+128*14+r5*8] + mova m4, [cq+128*18+r5*8] + mova m5, [cq+128*22+r5*8] + mova m6, [cq+128*26+r5*8] + mova m7, [cq+128*30+r5*8] + add r3, 16*(16+4*ARCH_X86_32) + call m(idct_16x4_internal_16bpc).main_oddhalf + mova m0, [cq+128* 0+r5*8] + mova m1, [cq+128* 4+r5*8] + mova m2, [cq+128* 8+r5*8] + mova m3, [cq+128*12+r5*8] + mova m4, [cq+128*16+r5*8] + mova m5, [cq+128*20+r5*8] + mova m6, [cq+128*24+r5*8] + mova m7, [cq+128*28+r5*8] + call m(idct_8x4_internal_16bpc).main_pass1 + call m(idct_8x4_internal_16bpc).round + sub r3, 16*(16+4*ARCH_X86_32) + call m(inv_txfm_add_dct_dct_32x8_16bpc).round_dct32 + movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] + movzx t1d, t0b + shr t0d, 8 +%if ARCH_X86_64 + call m(idct_8x4_internal_16bpc).transpose4x8packed + call m(idct_16x4_internal_16bpc).transpose4x8packed_hi + mova [rsp+32*16+r5*8+2*32*16], m8 + mova [rsp+40*16+r5*8+2*32*16], m10 + mova [rsp+32*16+t1*8+2*32*16], m9 + mova [rsp+32*16+t0*8+2*32*16], m11 + mova m8, [r3+16* 9] ; 8 9 + mova m10, [r3+16*11] ; 10 11 + mova m12, [r3+16*13] ; 12 13 + mova m14, [r3+16*15] ; 14 15 + call m(idct_16x4_internal_16bpc).transpose4x8packed_hi + mova [rsp+32*16+r5*8+1*32*16], m8 + mova [rsp+40*16+r5*8+1*32*16], m10 + mova [rsp+32*16+t1*8+1*32*16], m9 + mova [rsp+32*16+t0*8+1*32*16], m11 + mova m8, [r3+16* 8] ; 24 25 + mova m10, [r3+16*10] ; 26 27 + mova m12, [r3+16*12] ; 28 29 + mova m14, [r3+16*14] ; 30 31 + call m(idct_16x4_internal_16bpc).transpose4x8packed_hi + mova [rsp+32*16+r5*8+3*32*16], m8 + mova [rsp+40*16+r5*8+3*32*16], m10 + mova [rsp+32*16+t1*8+3*32*16], m9 + mova [rsp+32*16+t0*8+3*32*16], m11 +%else + sub r3, 8*16 + mova m0, [r3+ 8*16] + mova m2, [r3+10*16] + mova m4, [r3+12*16] + mova m6, [r3+14*16] + packssdw m0, [r3+ 9*16] + packssdw m2, [r3+11*16] + packssdw m4, [r3+13*16] + packssdw m6, [r3+15*16] + call m(idct_8x4_internal_16bpc).transpose4x8packed + mova [rsp+32*16+r5*8+1*32*16], m0 + mova [rsp+40*16+r5*8+1*32*16], m2 + mova [rsp+32*16+t1*8+1*32*16], m1 + mova [rsp+32*16+t0*8+1*32*16], m3 + mova m0, [r3+16*16] + mova m2, [r3+18*16] + mova m4, [r3+20*16] + mova m6, [r3+22*16] + packssdw m0, [r3+17*16] + packssdw m2, [r3+19*16] + packssdw m4, [r3+21*16] + packssdw m6, [r3+23*16] + call m(idct_8x4_internal_16bpc).transpose4x8packed + mova [rsp+32*16+r5*8+2*32*16], m0 + mova [rsp+40*16+r5*8+2*32*16], m2 + mova [rsp+32*16+t1*8+2*32*16], m1 + mova [rsp+32*16+t0*8+2*32*16], m3 + mova m0, [r3+31*16] + mova m2, [r3+29*16] + mova m4, [r3+27*16] + mova m6, [r3+25*16] + packssdw m0, [r3+30*16] + packssdw m2, [r3+28*16] + packssdw m4, [r3+26*16] + packssdw m6, [r3+24*16] + call m(idct_8x4_internal_16bpc).transpose4x8packed + mova [rsp+32*16+r5*8+3*32*16], m0 + mova [rsp+40*16+r5*8+3*32*16], m2 + mova [rsp+32*16+t1*8+3*32*16], m1 + mova [rsp+32*16+t0*8+3*32*16], m3 + mova m0, [r3+ 0*16] + mova m2, [r3+ 2*16] + mova m4, [r3+ 4*16] + mova m6, [r3+ 6*16] + packssdw m0, [r3+ 1*16] + 
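+    ; pack the remaining dword rows to words before the 4x8 transpose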
packssdw m2, [r3+ 3*16] + packssdw m4, [r3+ 5*16] + packssdw m6, [r3+ 7*16] + call m(idct_8x4_internal_16bpc).transpose4x8packed +%endif + pxor m7, m7 + ; clear lower half of [cq] + REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \ + 8, 9, 10, 11, 12, 13, 14, 15, \ + 16, 17, 18, 19, 20, 21, 22, 23, \ + 24, 25, 26, 27, 28, 29, 30, 31 + mova [rsp+32*16+r5*8+0*32*16], m0 + mova [rsp+40*16+r5*8+0*32*16], m2 + mova [rsp+32*16+t1*8+0*32*16], m1 + mova [rsp+32*16+t0*8+0*32*16], m3 + sub r5d, 2 + jge .loop_pass1 + + ; pass=2 code starts here + mov eobd, [rsp+gprsize*0+5*32*16] + add rsp, 29*16 + cmp eobd, 36 + jl .load_veryfast + cmp eobd, 136 + jl .load_fast + ; load normal + lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)] + jmp .run +.load_fast: + lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] + jmp .run +.load_veryfast: + lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] + ; fall-through +.run: +%if ARCH_X86_64 + lea r2, [dstq+64] + mov r7, -8 +%else + lea r2, [rsp+(4*32+3)*16] + mov dword [r2+0*gprsize], 4 +%endif + jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry + +.dconly: + imul r5d, [cq], 181 + mov [cq], eobd ; 0 + mov r3d, 32 + add rsp, (5*32+1-(24+8*ARCH_X86_32))*16 + jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly1 + +cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 16, \ + 0-(12+2*64)*16-(4+4*ARCH_X86_32)*gprsize, \ + dst, stride, c, eob + LEA r6, base + test eobd, eobd + jz .dconly + +%if ARCH_X86_32 + DECLARE_REG_TMP 4, 1, 2, 0 + mov [rsp+gprsize*1+(64*2+12)*16], r0 + mov [rsp+gprsize*2+(64*2+12)*16], r1 + mov [rsp+gprsize*3+(64*2+12)*16], r2 +%else + DECLARE_REG_TMP 8, 9, 4, 7 + mov [rsp+gprsize*1+(64*2+12)*16], r9 +%if WIN64 + mov [rsp+gprsize*2+(64*2+12)*16], r7 + mov [rsp+gprsize*3+(64*2+12)*16], r8 +%endif +%endif +%undef cmp + ; remove entirely-zero iterations + mov r5d, 7*2 + cmp eobw, word [o2(tbl_16x32_2d)+r5] + jge .end_zero_loop + pxor m0, m0 +.zero_loop: + movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] + movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] + movzx t0d, t1b + movzx t2d, t3b + shr t1d, 8 + shr t3d, 8 + mova [rsp+12*16+t0*8], m0 + mova [rsp+12*16+t1*8], m0 + mova [rsp+12*16+t2*8], m0 + mova [rsp+12*16+t3*8], m0 + mova [rsp+76*16+t0*8], m0 + mova [rsp+76*16+t1*8], m0 + mova [rsp+76*16+t2*8], m0 + mova [rsp+76*16+t3*8], m0 + sub r5d, 2 + cmp eobw, word [o2(tbl_16x32_2d)+r5] + jl .zero_loop +.end_zero_loop: + ; actual first pass after skipping all-zero data + mov [rsp+gprsize*0+(64*2+12)*16], eobd + mov r3, rsp +%if ARCH_X86_32 + DECLARE_REG_TMP 4, 1, 6, 0 + mov r2, [rsp+gprsize*3+(64*2+12)*16] + mov [rsp+gprsize*3+(64*2+12)*16], r6 +%endif +.loop_pass1: +%if ARCH_X86_64 + mova m11, [o(pd_2048)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] + mova m14, [o(pd_2896)] +%endif + mova m0, [cq+ 1*128+r5*8] + mova m1, [cq+ 3*128+r5*8] + mova m2, [cq+ 5*128+r5*8] + mova m3, [cq+ 7*128+r5*8] + mova m4, [cq+ 9*128+r5*8] + mova m5, [cq+11*128+r5*8] + mova m6, [cq+13*128+r5*8] + mova m7, [cq+15*128+r5*8] + call m(idct_16x4_internal_16bpc).main_oddhalf + + mova m0, [cq+ 0*128+r5*8] + mova m1, [cq+ 2*128+r5*8] + mova m2, [cq+ 4*128+r5*8] + mova m3, [cq+ 6*128+r5*8] + mova m4, [cq+ 8*128+r5*8] + mova m5, [cq+10*128+r5*8] + mova m6, [cq+12*128+r5*8] + mova m7, [cq+14*128+r5*8] + call m(idct_8x4_internal_16bpc).main_pass1 + call m(idct_8x4_internal_16bpc).round + call m(idct_16x16_internal_16bpc).round +%if ARCH_X86_64 + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + packssdw m6, m7 + packssdw m8, m9 + 
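+    ; pack the high registers too; pass 2 runs on 16-bit coefficients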
packssdw m10, m11 + packssdw m12, m13 + packssdw m14, m15 +%endif + call m(idct_8x4_internal_16bpc).transpose4x8packed + movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] + movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] + movzx t0d, t1b + movzx t2d, t3b + shr t1d, 8 + shr t3d, 8 +%if ARCH_X86_64 + call m(idct_16x4_internal_16bpc).transpose4x8packed_hi + mova [rsp+76*16+t0*8], m8 + mova [rsp+76*16+t1*8], m9 + mova [rsp+76*16+t2*8], m10 + mova [rsp+76*16+t3*8], m11 +%else + mova [rsp+76*16+t0*8], m0 + mova [rsp+76*16+t1*8], m1 + mova [rsp+76*16+t2*8], m2 + mova [rsp+76*16+t3*8], m3 + mova m0, [rsp+ 8*16] + mova m2, [rsp+ 9*16] + mova m4, [rsp+10*16] + mova m6, [rsp+11*16] + call m(idct_8x4_internal_16bpc).transpose4x8packed +%endif + mova [rsp+12*16+t0*8], m0 + mova [rsp+12*16+t1*8], m1 + mova [rsp+12*16+t2*8], m2 + mova [rsp+12*16+t3*8], m3 +%if ARCH_X86_32 + mov r6, [rsp+gprsize*3+(64*2+12)*16] +%endif + pxor m7, m7 + REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + sub r5d, 2 + jge .loop_pass1 + + ; pass=2 + mov eobd, [rsp+gprsize*0+(64*2+12)*16] + cmp eobd, 151 + jl .fast + ; fall-through +%if ARCH_X86_64 + DECLARE_REG_TMP 8, 9 +%else + DECLARE_REG_TMP 1, 5 +%endif + lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] + lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)] + jmp .run +.fast: + lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] + lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)] +.run: + add rsp, 9*16 + +%if ARCH_X86_64 + lea r2, [dstq+32] + mov r7, -4 +%else + lea r2, [rsp+(64*2+3)*16] + mov [r2+4*gprsize], t0 + mov [r2+5*gprsize], t1 + mov r1, [r2+2*gprsize] + mov dword [r2+0*gprsize], 2 +%endif +.loop_pass2: +%if ARCH_X86_32 + mov dstq, [r2+1*gprsize] +%endif + call .pass2 + add rsp, 64*16 +%if ARCH_X86_64 + add r7, 2 + lea dstq, [r2+r7*8] + jl .loop_pass2 +%else + add dword [r2+1*gprsize], 16 + dec dword [r2+0*gprsize] + jg .loop_pass2 +%endif +%assign stack_size (stack_size-(64*2+9)*16) +%if STACK_ALIGNMENT >= 16 +%assign stack_size_padded (stack_size_padded-(64*2+9)*16) +%assign stack_offset (stack_offset-(64*2+9)*16) +%else +%xdefine rstkm [rsp + stack_size] +%endif +%if ARCH_X86_64 + mov r9, [rsp+gprsize*1+3*16] +%if WIN64 + mov r7, [rsp+gprsize*2+3*16] + mov r8, [rsp+gprsize*3+3*16] +%endif +%endif + RET + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + mova m0, [rsp+gprsize+16* 3] + mova m1, [rsp+gprsize+16* 4] + mova m2, [rsp+gprsize+16* 5] + mova m3, [rsp+gprsize+16* 6] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m_suffix(idct_8x8_internal_8bpc, _ssse3).main + mova [rsp+gprsize+ 3*16], m0 + mova [rsp+gprsize+ 4*16], m1 + mova [rsp+gprsize+ 5*16], m2 + mova [rsp+gprsize+ 6*16], m3 + mova [rsp+gprsize+ 7*16], m4 + mova [rsp+gprsize+ 8*16], m5 + mova [rsp+gprsize+ 9*16], m6 + mova [rsp+gprsize+10*16], m7 + mova m0, [rsp+gprsize+16*11] + mova m1, [rsp+gprsize+16*12] + mova m2, [rsp+gprsize+16*13] + mova m3, [rsp+gprsize+16*14] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m_suffix(idct_16x8_internal_8bpc, _ssse3).main + mova m7, [rsp+gprsize+ 0*16] + mova [rsp+gprsize+11*16], m0 + mova [rsp+gprsize+12*16], m1 + mova [rsp+gprsize+13*16], m2 + mova [rsp+gprsize+14*16], m3 + mova [rsp+gprsize+15*16], m4 + mova [rsp+gprsize+16*16], m5 + mova [rsp+gprsize+17*16], m6 + mova [rsp+gprsize+18*16], m7 +%if ARCH_X86_64 + call r8 +%else + call [r2+4*gprsize] +%endif + mova [rsp+gprsize+ 3*16], m0 + mova [rsp+gprsize+ 5*16], m2 + mova [rsp+gprsize+ 8*16], m5 + 
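+    ; last 8x32 main output; the 16x64 main called below reads these slots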
mova [rsp+gprsize+10*16], m7 +%if ARCH_X86_64 + call r9 + mova m8, [o(pw_2048)] + pxor m9, m9 + mova m10, [o(pixel_10bpc_max)] +%else + call [r2+5*gprsize] +%endif + lea r3, [strideq*3] + lea r4, [rsp+gprsize+ 3*16] +%if ARCH_X86_64 + mov r6d, 8 +%else + mov dword [r2+2*gprsize], 8 +%endif +.loop_write: + mova m0, [r4+0*16] + mova m1, [r4+1*16] + mova m2, [r4+2*16] + mova m3, [r4+3*16] + mova m4, [r4+4*16] + mova m5, [r4+5*16] + mova m6, [r4+6*16] + mova m7, [r4+7*16] + call m(idct_8x8_internal_16bpc).round1_and_write_8x8 + lea dstq, [dstq+strideq*8] + add r4, 8*16 +%if ARCH_X86_64 + dec r6d +%else + dec dword [r2+2*gprsize] +%endif + jg .loop_write + ret + +.dconly: + imul r5d, [cq], 181 + mov [cq], eobd ; 0 + mov r3d, 64 + add r5d, 640 + sar r5d, 10 + add rsp, (12+2*64)*16+(4+4*ARCH_X86_32)*gprsize-(8+4*ARCH_X86_32)*16 + jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2 + +cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 16, \ + 0-(32+4*64)*16-(4+4*ARCH_X86_32)*gprsize, \ + dst, stride, c, eob + LEA r6, base + test eobd, eobd + jz .dconly + +%if ARCH_X86_32 + DECLARE_REG_TMP 4, 1, 2, 0 + mov [rsp+gprsize*1+(64*4+32)*16], r0 + mov [rsp+gprsize*2+(64*4+32)*16], r1 + mov [rsp+gprsize*3+(64*4+32)*16], r2 +%else + DECLARE_REG_TMP 8, 9, 4, 7 + mov [rsp+gprsize*1+(64*4+32)*16], r9 +%if WIN64 + mov [rsp+gprsize*2+(64*4+32)*16], r7 + mov [rsp+gprsize*3+(64*4+32)*16], r8 +%endif +%endif +%undef cmp + ; remove entirely-zero iterations + mov r5d, 7*2 + cmp eobw, word [o2(tbl_32x32_2d)+r5] + jge .end_zero_loop + pxor m0, m0 +.zero_loop: + movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] + movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] + movzx t0d, t1b + movzx t2d, t3b + shr t1d, 8 + shr t3d, 8 + mova [rsp+ 32*16+t0*8], m0 + mova [rsp+ 32*16+t1*8], m0 + mova [rsp+ 32*16+t2*8], m0 + mova [rsp+ 32*16+t3*8], m0 + mova [rsp+ 96*16+t0*8], m0 + mova [rsp+ 96*16+t1*8], m0 + mova [rsp+ 96*16+t2*8], m0 + mova [rsp+ 96*16+t3*8], m0 + mova [rsp+160*16+t0*8], m0 + mova [rsp+160*16+t1*8], m0 + mova [rsp+160*16+t2*8], m0 + mova [rsp+160*16+t3*8], m0 + mova [rsp+224*16+t0*8], m0 + mova [rsp+224*16+t1*8], m0 + mova [rsp+224*16+t2*8], m0 + mova [rsp+224*16+t3*8], m0 + sub r5d, 2 + cmp eobw, word [o2(tbl_32x32_2d)+r5] + jl .zero_loop +.end_zero_loop: + ; actual first pass after skipping all-zero data + mov [rsp+gprsize*0+(64*4+32)*16], eobd + mov r3, rsp +%if ARCH_X86_32 + DECLARE_REG_TMP 4, 1, 6, 0 + mov r2, [rsp+gprsize*3+(64*4+32)*16] + mov [rsp+gprsize*3+(64*4+32)*16], r6 +%endif +.loop_pass1: +%if ARCH_X86_64 + mova m11, [o(pd_2048)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] + mova m14, [o(pd_2896)] +%endif + mova m0, [cq+128* 1+r5*8] + mova m1, [cq+128* 7+r5*8] + mova m2, [cq+128* 9+r5*8] + mova m3, [cq+128*15+r5*8] + mova m4, [cq+128*17+r5*8] + mova m5, [cq+128*23+r5*8] + mova m6, [cq+128*25+r5*8] + mova m7, [cq+128*31+r5*8] + mov r3, rsp + call m(idct_8x4_internal_16bpc).rect2_mul + call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1 + + mova m0, [cq+128* 3+r5*8] + mova m1, [cq+128* 5+r5*8] + mova m2, [cq+128*11+r5*8] + mova m3, [cq+128*13+r5*8] + mova m4, [cq+128*19+r5*8] + mova m5, [cq+128*21+r5*8] + mova m6, [cq+128*27+r5*8] + mova m7, [cq+128*29+r5*8] +%if ARCH_X86_32 + add r3, 16*8 +%endif + call m(idct_8x4_internal_16bpc).rect2_mul +%if ARCH_X86_32 + sub r3, 16*8 +%endif + call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2 + add r3, 16*(16+4*ARCH_X86_32) + + mova m0, [cq+128* 2+r5*8] + mova m1, [cq+128* 6+r5*8] + mova m2, [cq+128*10+r5*8] + mova m3, [cq+128*14+r5*8] + mova m4, 
[cq+128*18+r5*8] + mova m5, [cq+128*22+r5*8] + mova m6, [cq+128*26+r5*8] + mova m7, [cq+128*30+r5*8] + call m(idct_8x4_internal_16bpc).rect2_mul + call m(idct_16x4_internal_16bpc).main_oddhalf + + mova m0, [cq+128* 0+r5*8] + mova m1, [cq+128* 4+r5*8] + mova m2, [cq+128* 8+r5*8] + mova m3, [cq+128*12+r5*8] + mova m4, [cq+128*16+r5*8] + mova m5, [cq+128*20+r5*8] + mova m6, [cq+128*24+r5*8] + mova m7, [cq+128*28+r5*8] + call m(idct_8x4_internal_16bpc).rect2_mul + call m(idct_8x4_internal_16bpc).main_pass1 + call m(idct_8x4_internal_16bpc).round + sub r3, 16*(16+4*ARCH_X86_32) + call m(inv_txfm_add_dct_dct_32x16_16bpc).round_dct32 + + movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] + movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] + movzx t0d, t1b + movzx t2d, t3b + shr t1d, 8 + shr t3d, 8 +%if ARCH_X86_64 + call m(idct_8x4_internal_16bpc).transpose4x8packed + call m(idct_16x4_internal_16bpc).transpose4x8packed_hi + mova [rsp+160*16+t0*8], m8 + mova [rsp+160*16+t1*8], m9 + mova [rsp+160*16+t2*8], m10 + mova [rsp+160*16+t3*8], m11 + mova m8, [r3+16* 9] ; 8 9 + mova m10, [r3+16*11] ; 10 11 + mova m12, [r3+16*13] ; 12 13 + mova m14, [r3+16*15] ; 14 15 + call m(idct_16x4_internal_16bpc).transpose4x8packed_hi + mova [rsp+ 96*16+t0*8], m8 + mova [rsp+ 96*16+t1*8], m9 + mova [rsp+ 96*16+t2*8], m10 + mova [rsp+ 96*16+t3*8], m11 + mova m8, [r3+16* 8] ; 24 25 + mova m10, [r3+16*10] ; 26 27 + mova m12, [r3+16*12] ; 28 29 + mova m14, [r3+16*14] ; 30 31 + call m(idct_16x4_internal_16bpc).transpose4x8packed_hi + mova [rsp+224*16+t0*8], m8 + mova [rsp+224*16+t1*8], m9 + mova [rsp+224*16+t2*8], m10 + mova [rsp+224*16+t3*8], m11 +%else + sub r3, 8*16 + mova m0, [r3+ 8*16] + mova m2, [r3+10*16] + mova m4, [r3+12*16] + mova m6, [r3+14*16] + packssdw m0, [r3+ 9*16] + packssdw m2, [r3+11*16] + packssdw m4, [r3+13*16] + packssdw m6, [r3+15*16] + call m(idct_8x4_internal_16bpc).transpose4x8packed + mova [rsp+ 96*16+t0*8], m0 + mova [rsp+ 96*16+t1*8], m1 + mova [rsp+ 96*16+t2*8], m2 + mova [rsp+ 96*16+t3*8], m3 + mova m0, [r3+16*16] + mova m2, [r3+18*16] + mova m4, [r3+20*16] + mova m6, [r3+22*16] + packssdw m0, [r3+17*16] + packssdw m2, [r3+19*16] + packssdw m4, [r3+21*16] + packssdw m6, [r3+23*16] + call m(idct_8x4_internal_16bpc).transpose4x8packed + mova [rsp+160*16+t0*8], m0 + mova [rsp+160*16+t1*8], m1 + mova [rsp+160*16+t2*8], m2 + mova [rsp+160*16+t3*8], m3 + mova m0, [r3+31*16] + mova m2, [r3+29*16] + mova m4, [r3+27*16] + mova m6, [r3+25*16] + packssdw m0, [r3+30*16] + packssdw m2, [r3+28*16] + packssdw m4, [r3+26*16] + packssdw m6, [r3+24*16] + call m(idct_8x4_internal_16bpc).transpose4x8packed + mova [rsp+224*16+t0*8], m0 + mova [rsp+224*16+t1*8], m1 + mova [rsp+224*16+t2*8], m2 + mova [rsp+224*16+t3*8], m3 + mova m0, [r3+ 0*16] + mova m2, [r3+ 2*16] + mova m4, [r3+ 4*16] + mova m6, [r3+ 6*16] + packssdw m0, [r3+ 1*16] + packssdw m2, [r3+ 3*16] + packssdw m4, [r3+ 5*16] + packssdw m6, [r3+ 7*16] + call m(idct_8x4_internal_16bpc).transpose4x8packed +%endif + mova [rsp+ 32*16+t0*8], m0 + mova [rsp+ 32*16+t1*8], m1 + mova [rsp+ 32*16+t2*8], m2 + mova [rsp+ 32*16+t3*8], m3 + pxor m0, m0 + REPX {mova [cq+x*128+r5*8], m0}, 0, 1, 2, 3, 4, 5, 6, 7, \ + 8, 9, 10, 11, 12, 13, 14, 15, \ + 16, 17, 18, 19, 20, 21, 22, 23, \ + 24, 25, 26, 27, 28, 29, 30, 31 +%if ARCH_X86_32 + mov r6, [rsp+gprsize*3+(64*4+32)*16] +%endif + sub r5d, 2 + jge .loop_pass1 + + ; pass=2 + mov eobd, [rsp+gprsize*0+(64*4+32)*16] + cmp eobd, 136 + jl .fast + ; fall-through +%if ARCH_X86_64 + DECLARE_REG_TMP 8, 9 +%else + DECLARE_REG_TMP 1, 5 +%endif + 
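+    ; eob >= 136: pair the 8x32 main_fast with the full 16x64 main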
lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] + lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)] + jmp .run +.fast: + lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] + lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)] +.run: + add rsp, 29*16 + +%if ARCH_X86_64 + lea r2, [dstq+64] + mov r7, -8 +%else + lea r2, [rsp+(64*4+3)*16] + mov [r2+4*gprsize], t0 + mov [r2+5*gprsize], t1 + mov r1, [r2+2*gprsize] + mov dword [r2+0*gprsize], 4 +%endif + jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2 + +.dconly: + imul r5d, [cq], 181 + mov [cq], eobd ; 0 + mov r3d, 64 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 + add r5d, 384 + sar r5d, 9 + add rsp, (32+4*64)*16+(4+4*ARCH_X86_32)*gprsize-(24+8*ARCH_X86_32)*16 + jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2 + +cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \ + dst, stride, c, eob + LEA r6, base + test eobd, eobd + jz .dconly + + ; remove entirely-zero iterations +%undef cmp + mov r5d, 8 +.zero_loop: + sub r5d, 2 + cmp eobw, word [o2(tbl_32x16_2d)+r5] + jl .zero_loop + + ; actual first pass after skipping all-zero data +.loop_pass1: +%if ARCH_X86_64 + mova m11, [o(pd_2048)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] + mova m14, [o(pd_2896)] +%endif + + mov r3, rsp + lea r4, [o(idct64_mul_16bpc)] + mova m0, [cq+64* 1+r5*8] + mova m1, [cq+64*31+r5*8] + mova m2, [cq+64*17+r5*8] + mova m3, [cq+64*15+r5*8] + call .main_part1 + mova m0, [cq+64* 7+r5*8] + mova m1, [cq+64*25+r5*8] + mova m2, [cq+64*23+r5*8] + mova m3, [cq+64* 9+r5*8] + call .main_part1 + mova m0, [cq+64* 5+r5*8] + mova m1, [cq+64*27+r5*8] + mova m2, [cq+64*21+r5*8] + mova m3, [cq+64*11+r5*8] + call .main_part1 + mova m0, [cq+64* 3+r5*8] + mova m1, [cq+64*29+r5*8] + mova m2, [cq+64*19+r5*8] + mova m3, [cq+64*13+r5*8] + call .main_part1 + call .main_part2 + + mova m0, [cq+64* 2+r5*8] + mova m1, [cq+64*14+r5*8] + mova m2, [cq+64*18+r5*8] + mova m3, [cq+64*30+r5*8] + call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast + + mova m0, [cq+64* 6+r5*8] + mova m1, [cq+64*10+r5*8] + mova m2, [cq+64*22+r5*8] + mova m3, [cq+64*26+r5*8] + call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast + add r3, 16*(24+4*ARCH_X86_32) + + mova m0, [cq+64* 4+r5*8] + mova m1, [cq+64*12+r5*8] + mova m2, [cq+64*20+r5*8] + mova m3, [cq+64*28+r5*8] + call m(idct_16x4_internal_16bpc).main_oddhalf_fast + + mova m0, [cq+64* 0+r5*8] + mova m1, [cq+64* 8+r5*8] + mova m2, [cq+64*16+r5*8] + mova m3, [cq+64*24+r5*8] + call m(idct_8x4_internal_16bpc).main_pass1_fast + call m(idct_8x4_internal_16bpc).round + mova [r3-(7+4*ARCH_X86_32)*16], m1 + mova [r3-(6+4*ARCH_X86_32)*16], m2 + mova [r3-(5+4*ARCH_X86_32)*16], m3 + mova [r3-(4+4*ARCH_X86_32)*16], m4 + mova [r3-(3+4*ARCH_X86_32)*16], m5 + mova [r3-(2+4*ARCH_X86_32)*16], m6 + mova [r3-(1+4*ARCH_X86_32)*16], m7 + sub r3, 16*(40+4*ARCH_X86_32-4) + +%if ARCH_X86_64 + psrld m15, m11, 10 ; pd_2 +%else + mova m7, [o(pd_2)] +%endif + call .main_end_loop_start + + lea r3, [rsp+56*16] + lea r4, [cq+r5*8+64*28] + call .shift_transpose + sub r5d, 2 + jge .loop_pass1 + + ; pass=2, we need to call this otherwise the stack pointer has + ; the wrong offset in the 8-bit code + call .pass2 + RET + +.pass2: +%if ARCH_X86_64 + mova m8, [o(pw_2048)] + pxor m9, m9 + mova m10, [o(pixel_10bpc_max)] +%if WIN64 + mov [rsp+16*16+gprsize], r7 +%endif + mov r7, dstq +%else + mov [rsp+2*gprsize+16*16], dstq +%endif + lea r3, [strideq*3] + mov r4d, 8 + jmp 
m(idct_16x16_internal_16bpc).loop_pass2 + +.main_part1: ; idct64 steps 1-5 + ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a +%if ARCH_X86_64 + movd m7, [r4+4*0] + movd m8, [r4+4*1] + movd m6, [r4+4*2] + movd m9, [r4+4*3] + movd m5, [r4+4*4] + movd m10, [r4+4*5] + movd m4, [r4+4*6] + movd m15, [r4+4*7] + REPX {pshufd x, x, q0000}, m7, m8, m6, m9, m5, m10, m4, m15 + pmulld m7, m0 ; t63a + pmulld m0, m8 ; t32a + pmulld m6, m1 ; t62a + pmulld m1, m9 ; t33a + pmulld m5, m2 ; t61a + pmulld m2, m10 ; t34a + pmulld m4, m3 ; t60a + pmulld m3, m15 ; t35a + movd m10, [r4+4*8] + movd m15, [r4+4*9] + REPX {pshufd x, x, q0000}, m10, m15 + REPX {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3 + REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4 + psubd m8, m0, m1 ; t33 + paddd m0, m1 ; t32 + psubd m1, m7, m6 ; t62 + paddd m7, m6 ; t63 + psubd m6, m3, m2 ; t34 + paddd m3, m2 ; t35 + psubd m2, m4, m5 ; t61 + paddd m4, m5 ; t60 + REPX {pmaxsd x, m12}, m8, m1, m6, m2 + REPX {pminsd x, m13}, m8, m1, m6, m2 + ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a + ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 4 ; t61a, t34a + REPX {pmaxsd x, m12}, m0, m3, m7, m4 + REPX {pminsd x, m13}, m0, m3, m7, m4 + movd m10, [r4+4*10] + movd m15, [r4+4*11] + REPX {pshufd x, x, q0000}, m10, m15 + psubd m5, m0, m3 ; t35a + paddd m0, m3 ; t32a + psubd m3, m7, m4 ; t60a + paddd m7, m4 ; t63a + psubd m4, m1, m6 ; t34 + paddd m1, m6 ; t33 + psubd m6, m8, m2 ; t61 + paddd m8, m2 ; t62 + REPX {pmaxsd x, m12}, m5, m3, m4, m6 + REPX {pminsd x, m13}, m5, m3, m4, m6 + ITX_MULSUB_2D 3, 5, 2, 9, _, 11, 10, 15 ; t35, t60 + ITX_MULSUB_2D 6, 4, 2, 9, _, 11, 10, 15 ; t34a, t61a + REPX {pmaxsd x, m12}, m0, m7, m1, m8 + REPX {pminsd x, m13}, m0, m7, m1, m8 + add r4, 4*12 + mova [r3+16*0], m0 + mova [r3+16*7], m7 + mova [r3+16*1], m1 + mova [r3+16*6], m8 + mova [r3+16*2], m6 + mova [r3+16*5], m4 + mova [r3+16*3], m3 + mova [r3+16*4], m5 +%else + movd m7, [r4+4*0] + movd m6, [r4+4*2] + movd m5, [r4+4*4] + movd m4, [r4+4*6] + REPX {pshufd x, x, q0000}, m7, m6, m5, m4 + pmulld m7, m0 ; t63a + pmulld m6, m1 ; t62a + pmulld m5, m2 ; t61a + pmulld m4, m3 ; t60a + mova [r3+0*16], m6 + mova [r3+1*16], m7 + movd m6, [r4+4*1] + movd m7, [r4+4*3] + REPX {pshufd x, x, q0000}, m7, m6 + pmulld m0, m6 ; t32a + pmulld m1, m7 ; t33a + movd m6, [r4+4*5] + movd m7, [r4+4*7] + REPX {pshufd x, x, q0000}, m7, m6 + pmulld m2, m6 ; t34a + pmulld m3, m7 ; t35a + mova m6, [r3+0*16] + mova m7, [o(pd_2048)] + REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 + paddd m7, [r3+1*16] + REPX {psrad x, 12}, m0, m1, m7, m6, m2, m3, m5, m4 + mova [r3+0*16], m5 + psubd m5, m0, m1 ; t33 + paddd m0, m1 ; t32 + mova [r3+1*16], m0 + mova m0, [r3+0*16] + psubd m1, m7, m6 ; t62 + paddd m7, m6 ; t63 + psubd m6, m3, m2 ; t34 + paddd m3, m2 ; t35 + psubd m2, m4, m0 ; t61 + paddd m4, m0 ; t60 + mova m0, [o(clip_18b_min)] + REPX {pmaxsd x, m0}, m5, m1, m7, m6, m3, m2, m4 + pmaxsd m0, [r3+1*16] + mova [r3+0*16], m0 + mova m0, [o(clip_18b_max)] + REPX {pminsd x, m0}, m5, m1, m7, m6, m3, m2, m4 + pminsd m0, [r3+0*16] + mova [r3+0*16], m0 + mova [r3+1*16], m3 + mova [r3+2*16], m4 + mova [r3+3*16], m7 + mova m0, [o(pd_2048)] + movd m3, [r4+4*8] + movd m4, [r4+4*9] + REPX {pshufd x, x, q0000}, m3, m4 + mova [r3+4*16], m2 + ITX_MULSUB_2D 1, 5, 2, 7, _, 0, 3, 4 ; t33a, t62a + mova m2, [r3+4*16] + mova [r3+4*16], m5 + ITX_MULSUB_2D 2, 6, 5, 7, _, 0, 3, 4, 4 ; t61a, 
t34a + mova m0, [r3+0*16] + mova m3, [r3+1*16] + mova m4, [r3+2*16] + mova m7, [r3+3*16] + psubd m5, m0, m3 ; t35a + paddd m0, m3 ; t32a + mova [r3+0*16], m5 + mova m5, [r3+4*16] + psubd m3, m7, m4 ; t60a + paddd m7, m4 ; t63a + psubd m4, m1, m6 ; t34 + paddd m1, m6 ; t33 + psubd m6, m5, m2 ; t61 + paddd m2, m5 ; t62 + mova m5, [o(clip_18b_min)] + REPX {pmaxsd x, m5}, m0, m3, m7, m4, m1, m6, m2 + pmaxsd m5, [r3+0*16] + mova [r3+0*16], m5 + mova m5, [o(clip_18b_max)] + REPX {pminsd x, m5}, m0, m3, m7, m4, m1, m6, m2 + pminsd m5, [r3+0*16] + mova [r3+16*0], m0 + mova [r3+16*7], m7 + mova [r3+16*1], m1 + mova [r3+16*6], m2 + mova [r3+16*2], m4 + mova m7, [o(pd_2048)] + movd m0, [r4+4*10] + movd m1, [r4+4*11] + REPX {pshufd x, x, q0000}, m0, m1 + ITX_MULSUB_2D 3, 5, 2, 4, _, 7, 0, 1 ; t35, t60 + mova [r3+16*3], m3 + mova [r3+16*4], m5 + mova m4, [r3+2*16] + ITX_MULSUB_2D 6, 4, 2, 3, _, 7, 0, 1 ; t34a, t61a + add r4, 4*12 + mova [r3+16*2], m6 + mova [r3+16*5], m4 +%endif + add r3, 16*8 + ret + +.main_part2: ; idct64 steps 6-9 + lea r4, [r3+16*7] +%if ARCH_X86_64 + mova m10, [o(pd_1567)] + mova m15, [o(pd_3784)] +.main_part2_loop: + mova m0, [r3-16*32] ; t32a + mova m1, [r4-16*24] ; t39a + mova m2, [r4-16*32] ; t63a + mova m3, [r3-16*24] ; t56a + mova m4, [r3-16*16] ; t40a + mova m5, [r4-16* 8] ; t47a + mova m6, [r4-16*16] ; t55a + mova m7, [r3-16* 8] ; t48a + psubd m8, m0, m1 ; t39 + paddd m0, m1 ; t32 + psubd m1, m2, m3 ; t56 + paddd m2, m3 ; t63 + psubd m3, m5, m4 ; t40 + paddd m5, m4 ; t47 + psubd m4, m7, m6 ; t55 + paddd m7, m6 ; t48 + REPX {pmaxsd x, m12}, m8, m1, m3, m4 + REPX {pminsd x, m13}, m8, m1, m3, m4 + ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a + ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 4 ; t55a, t40a + REPX {pmaxsd x, m12}, m0, m2, m5, m7 + REPX {pminsd x, m13}, m0, m5, m2, m7 + psubd m6, m2, m7 ; t48a + paddd m2, m7 ; t63a + psubd m7, m0, m5 ; t47a + paddd m0, m5 ; t32a + psubd m5, m8, m4 ; t55 + paddd m8, m4 ; t56 + psubd m4, m1, m3 ; t40 + paddd m1, m3 ; t39 + REPX {pmaxsd x, m12}, m6, m7, m5, m4 + REPX {pminsd x, m13}, m6, m7, m5, m4 + REPX {pmulld x, m14}, m6, m7, m5, m4 + REPX {pmaxsd x, m12}, m2, m0, m8, m1 + REPX {pminsd x, m13}, m2, m0, m8, m1 + paddd m6, m11 + paddd m5, m11 + psubd m3, m6, m7 ; t47 + paddd m6, m7 ; t48 + psubd m7, m5, m4 ; t40a + paddd m5, m4 ; t55a + REPX {psrad x, 12}, m3, m6, m7, m5 + mova [r4-16* 8], m2 + mova [r3-16*32], m0 + mova [r3-16* 8], m8 + mova [r4-16*32], m1 + mova [r4-16*24], m3 + mova [r3-16*16], m6 + mova [r3-16*24], m7 + mova [r4-16*16], m5 +%else +.main_part2_loop: + mova m0, [r3-16*32] ; t32a + mova m1, [r4-16*24] ; t39a + mova m2, [r4-16*32] ; t63a + mova m3, [r3-16*24] ; t56a + mova m4, [r3-16*16] ; t40a + mova m5, [r4-16* 8] ; t47a + mova m6, [r4-16*16] ; t55a + psubd m7, m0, m1 ; t39 + paddd m0, m1 ; t32 + mova [r3+0*16], m7 + mova m7, [r3-16* 8] ; t48a + psubd m1, m2, m3 ; t56 + paddd m2, m3 ; t63 + psubd m3, m5, m4 ; t40 + paddd m5, m4 ; t47 + psubd m4, m7, m6 ; t55 + paddd m7, m6 ; t48 + mova m6, [o(clip_18b_min)] + REPX {pmaxsd x, m6}, m0, m1, m2, m3, m5, m4, m7 + pmaxsd m6, [r3+0*16] + mova [r3+0*16], m6 + mova m6, [o(clip_18b_max)] + REPX {pminsd x, m6}, m0, m1, m2, m3, m5, m4, m7 + pminsd m6, [r3+0*16] + mova [r3+0*16], m0 + mova [r3+1*16], m2 + mova [r3+2*16], m5 + mova [r3+3*16], m7 + mova m0, [o(pd_2048)] + ITX_MULSUB_2D 1, 6, 2, 5, 7, 0, 1567, 3784 ; t39a, t56a + ITX_MULSUB_2D 4, 3, 2, 5, _, 0, 7, 3784, 4 ; t55a, t40a + mova m2, [r3+1*16] + mova m7, [r3+3*16] + psubd m5, m2, m7 ; t48a + paddd m2, m7 ; t63a + 
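+    ; x86-32 only has 8 XMM registers: spill t48a, then reload the
+    ; t32/t47 pair from the stack for the next butterflies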
mova [r3+1*16], m5 + mova m0, [r3+0*16] + mova m5, [r3+2*16] + psubd m7, m0, m5 ; t47a + paddd m0, m5 ; t32a + psubd m5, m6, m4 ; t55 + paddd m6, m4 ; t56 + psubd m4, m1, m3 ; t40 + paddd m1, m3 ; t39 + mova m3, [o(clip_18b_min)] + REPX {pmaxsd x, m3}, m2, m7, m0, m5, m6, m4, m1 + pmaxsd m3, [r3+1*16] + mova [r3+0*16], m3 + mova m3, [o(clip_18b_max)] + REPX {pminsd x, m3}, m2, m7, m0, m5, m6, m4, m1 + pminsd m3, [r3+0*16] + mova [r4-16* 8], m2 + mova [r3-16*32], m0 + mova [r3-16* 8], m6 + mova [r4-16*32], m1 + mova m0, [o(pd_2896)] + mova m1, [o(pd_2048)] + REPX {pmulld x, m0}, m3, m7, m5, m4 + REPX {paddd x, m1}, m3, m5 + psubd m6, m3, m7 ; t47 + paddd m3, m7 ; t48 + psubd m7, m5, m4 ; t40a + paddd m5, m4 ; t55a + REPX {psrad x, 12}, m6, m3, m7, m5 + mova [r4-16*24], m6 + mova [r3-16*16], m3 + mova [r3-16*24], m7 + mova [r4-16*16], m5 +%endif + add r3, 16 + sub r4, 16 + cmp r3, r4 + jl .main_part2_loop + sub r3, 4*16 + ret + +.main_end_loop: + mova m0, [r3+16*28] ; idct8 0 + n +.main_end_loop_start: + mova m2, [r3+16*12] ; idct32 16 + n + mova m3, [r4+16*12] ; idct32 31 - n +%if ARCH_X86_64 + mova m1, [r4+16*28] ; idct16 15 - n + mova m4, [r4-16* 4] ; idct64 63 - n + mova m5, [r3-16* 4] ; idct64 48 + n + mova m6, [r4-16*20] ; idct64 47 - n + mova m7, [r3-16*20] ; idct64 32 + n + pmaxsd m0, m12 + pminsd m0, m13 + paddd m8, m0, m1 ; idct16 out0 + n + psubd m0, m1 ; idct16 out15 - n + REPX {pmaxsd x, m12}, m8, m0 + REPX {pminsd x, m13}, m8, m0 + paddd m1, m8, m3 ; idct32 out0 + n + psubd m8, m3 ; idct32 out31 - n + paddd m3, m0, m2 ; idct32 out15 - n + psubd m0, m2 ; idct32 out16 + n + REPX {pmaxsd x, m12}, m1, m8, m3, m0 + REPX {pminsd x, m13}, m1, m3, m8, m0 + REPX {paddd x, m15}, m1, m3, m0, m8 + paddd m2, m1, m4 ; idct64 out0 + n (unshifted) + psubd m1, m4 ; idct64 out63 - n (unshifted) + paddd m4, m3, m5 ; idct64 out15 - n (unshifted) + psubd m3, m5 ; idct64 out48 + n (unshifted) + paddd m5, m0, m6 ; idct64 out16 + n (unshifted) + psubd m0, m6 ; idct64 out47 - n (unshifted) + paddd m6, m8, m7 ; idct64 out31 - n (unshifted) + psubd m8, m7 ; idct64 out32 + n (unshifted) + mova [r3-16*20], m2 + mova [r4+16*28], m1 + mova [r4-16*20], m4 + mova [r3+16*28], m3 + mova [r3-16* 4], m5 + mova [r4+16*12], m0 + mova [r4-16* 4], m6 + mova [r3+16*12], m8 +%else + mova m5, [o(clip_18b_min)] + mova m6, [o(clip_18b_max)] + mova m1, [r3+16*44] ; idct16 15 - n + pmaxsd m0, m5 + pminsd m0, m6 + paddd m4, m0, m1 ; idct16 out0 + n + psubd m0, m1 ; idct16 out15 - n + REPX {pmaxsd x, m5}, m4, m0 + REPX {pminsd x, m6}, m4, m0 + paddd m1, m4, m3 ; idct32 out0 + n + psubd m4, m3 ; idct32 out31 - n + paddd m3, m0, m2 ; idct32 out15 - n + psubd m0, m2 ; idct32 out16 + n + REPX {pmaxsd x, m5}, m1, m4, m3, m0 + REPX {pminsd x, m6}, m1, m3, m4, m0 + REPX {paddd x, m7}, m1, m3, m0, m4 + mova m5, [r4-16* 4] ; idct64 63 - n + mova m6, [r3-16* 4] ; idct64 48 + n + paddd m2, m1, m5 ; idct64 out0 + n (unshifted) + psubd m1, m5 ; idct64 out63 - n (unshifted) + paddd m5, m3, m6 ; idct64 out15 - n (unshifted) + psubd m3, m6 ; idct64 out48 + n (unshifted) + mova [r4+16*28], m1 + mova [r3+16*28], m3 + mova m6, [r4-16*20] ; idct64 47 - n + mova m1, [r3-16*20] ; idct64 32 + n + mova [r3-16*20], m2 + mova [r4-16*20], m5 + paddd m5, m0, m6 ; idct64 out16 + n (unshifted) + psubd m0, m6 ; idct64 out47 - n (unshifted) + paddd m6, m4, m1 ; idct64 out31 - n (unshifted) + psubd m4, m1 ; idct64 out32 + n (unshifted) + mova [r3-16* 4], m5 + mova [r4+16*12], m0 + mova [r4-16* 4], m6 + mova [r3+16*12], m4 +%endif + sub r4, 16 + add r3, 16 + 
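+    ; r3 walks forward and r4 backward; loop until the pointers cross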
cmp r3, r4 + jl .main_end_loop + ret + +.shift_transpose: + mova m0, [r3+0*16] + mova m1, [r3+1*16] + mova m2, [r3+2*16] + mova m3, [r3+3*16] + mova m4, [r3+4*16] + mova m5, [r3+5*16] + mova m6, [r3+6*16] + mova m7, [r3+7*16] + REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7 + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + packssdw m6, m7 + call m(idct_8x4_internal_16bpc).transpose4x8packed + mova [r4+0*64], m0 + mova [r4+1*64], m1 + mova [r4+2*64], m2 + mova [r4+3*64], m3 + sub r4, 4*64 + sub r3, 8*16 + cmp r3, rsp + jg .shift_transpose + ret + +.dconly: + imul r5d, [cq], 181 + mov [cq], eobd ; 0 + mov r3d, 16 +.dconly1: + add r5d, 640 + sar r5d, 10 +.dconly2: + imul r5d, 2896 + add r5d, 34816 + movd m0, r5d + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 + mova m6, [o(pixel_10bpc_max)] + pxor m5, m5 +.dconly_loop: + paddw m1, m0, [dstq+16*0] + paddw m2, m0, [dstq+16*1] + paddw m3, m0, [dstq+16*2] + paddw m4, m0, [dstq+16*3] + REPX {pmaxsw x, m5}, m1, m2, m3, m4 + REPX {pminsw x, m6}, m1, m2, m3, m4 + mova [dstq+16*0], m1 + mova [dstq+16*1], m2 + mova [dstq+16*2], m3 + mova [dstq+16*3], m4 + add dstq, 64 + btc r3d, 16 + jnc .dconly_loop + lea dstq, [dstq+strideq-128] + dec r3d + jg .dconly_loop + RET + +cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 16, \ + 0-(1+64+8*ARCH_X86_32+8*32+1*WIN64)*16, \ + dst, stride, c, eob + LEA r6, base + test eobd, eobd + jz .dconly + +%if ARCH_X86_32 + DECLARE_REG_TMP 0, 4, 1 + mov [rsp+(8*32+64+8)*16+1*gprsize], dstq + mov [rsp+(8*32+64+8)*16+2*gprsize], strideq +%else + DECLARE_REG_TMP 4, 7, 8 +%if WIN64 + mov [rsp+(8*32+64+1)*16+1*gprsize], r7 + mov [rsp+64*16+0*gprsize], r8 +%endif +%endif +%undef cmp + ; remove entirely-zero iterations + mov r5d, 14 + cmp eobw, word [o2(tbl_32x32_2d)+r5] + jge .end_zero_loop + pxor m0, m0 +.zero_loop: + movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] + movzx t1d, t0b + shr t0d, 8 + lea t2, [rsp+7*32*16] +.zero_loop_inner: + mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0 + mova [t2+(72+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0 + mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t0*8], m0 + mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t1*8], m0 + sub t2, 32*16 + cmp t2, rsp + jge .zero_loop_inner + sub r5d, 2 + cmp eobw, word [o2(tbl_32x32_2d)+r5] + jl .zero_loop +.end_zero_loop: + mov [rsp+(8*32+64+8*ARCH_X86_32+1*WIN64)*16+0*gprsize], eobd + ; actual first pass after skipping all-zero data +.loop_pass1: +%if ARCH_X86_64 + mova m11, [o(pd_2048)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] + mova m14, [o(pd_2896)] +%endif + + mov r3, rsp + lea r4, [o(idct64_mul_16bpc)] + mova m0, [cq+128* 1+r5*8] + mova m1, [cq+128*31+r5*8] + mova m2, [cq+128*17+r5*8] + mova m3, [cq+128*15+r5*8] + call .rect2_mul_fast + call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 + mova m0, [cq+128* 7+r5*8] + mova m1, [cq+128*25+r5*8] + mova m2, [cq+128*23+r5*8] + mova m3, [cq+128* 9+r5*8] + call .rect2_mul_fast + call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 + mova m0, [cq+128* 5+r5*8] + mova m1, [cq+128*27+r5*8] + mova m2, [cq+128*21+r5*8] + mova m3, [cq+128*11+r5*8] + call .rect2_mul_fast + call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 + mova m0, [cq+128* 3+r5*8] + mova m1, [cq+128*29+r5*8] + mova m2, [cq+128*19+r5*8] + mova m3, [cq+128*13+r5*8] + call .rect2_mul_fast + call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 + call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2 + + mova m0, [cq+128* 2+r5*8] + mova m1, [cq+128*14+r5*8] + mova m2, [cq+128*18+r5*8] + mova m3, [cq+128*30+r5*8] + call .rect2_mul_fast + call 
m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast + + mova m0, [cq+128* 6+r5*8] + mova m1, [cq+128*10+r5*8] + mova m2, [cq+128*22+r5*8] + mova m3, [cq+128*26+r5*8] + call .rect2_mul_fast + call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast + add r3, 16*(24+4*ARCH_X86_32) + + mova m0, [cq+128* 4+r5*8] + mova m1, [cq+128*12+r5*8] + mova m2, [cq+128*20+r5*8] + mova m3, [cq+128*28+r5*8] + call .rect2_mul_fast + call m(idct_16x4_internal_16bpc).main_oddhalf_fast + + mova m0, [cq+128* 0+r5*8] + mova m1, [cq+128* 8+r5*8] + mova m2, [cq+128*16+r5*8] + mova m3, [cq+128*24+r5*8] + call .rect2_mul_fast + call m(idct_8x4_internal_16bpc).main_pass1_fast + call m(idct_8x4_internal_16bpc).round + mova [r3-(7+4*ARCH_X86_32)*16], m1 + mova [r3-(6+4*ARCH_X86_32)*16], m2 + mova [r3-(5+4*ARCH_X86_32)*16], m3 + mova [r3-(4+4*ARCH_X86_32)*16], m4 + mova [r3-(3+4*ARCH_X86_32)*16], m5 + mova [r3-(2+4*ARCH_X86_32)*16], m6 + mova [r3-(1+4*ARCH_X86_32)*16], m7 + sub r3, 16*(40+4*ARCH_X86_32-4) + +%if ARCH_X86_64 + psrld m15, m11, 11 ; pd_1 +%else + mova m7, [o(pd_1)] +%endif + call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start + + lea r3, [rsp+56*16] + lea t2, [rsp+7*32*16+(64+8*ARCH_X86_32+1*WIN64)*16] + movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] + movzx t1d, t0b + shr t0d, 8 + call .shift_transpose + ; zero cq + pxor m7, m7 + lea r4, [cq+30*128+r5*8] +.zero_cq_loop: + REPX {mova [r4+x*128], m7}, -2, -1, 0, 1 + sub r4, 4*128 + cmp r4, cq + jg .zero_cq_loop + sub r5d, 2 + jge .loop_pass1 + + ; pass=2 code starts here + mov eobd, [rsp+gprsize*0+(8*32+64+8*ARCH_X86_32+1*WIN64)*16] +%if ARCH_X86_32 + mov strideq, [rsp+gprsize*2+(8*32+64+8)*16] +%elif WIN64 + mov r8, [rsp+gprsize*0+64*16] +%endif + add rsp, (64+8*ARCH_X86_32+1*WIN64-3)*16 + cmp eobd, 36 + jl .load_veryfast + cmp eobd, 136 + jl .load_fast + ; load normal + lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)] + jmp .run +.load_fast: + lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] + jmp .run +.load_veryfast: + lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] + ; fall-through +.run: +%if ARCH_X86_64 + lea r2, [dstq+128] + mov r7, -16 +%else + lea r2, [rsp+(8*32+3)*16] + mov dword [r2+0*gprsize], 8 +%endif + jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry + +.rect2_mul_fast: +%if ARCH_X86_64 + REPX {pmulld x, m14}, m0, m1, m2, m3 + REPX {paddd x, m11}, m0, m1, m2, m3 +%else + mova m4, [o(pd_2896)] + mova m5, [o(pd_2048)] + REPX {pmulld x, m4 }, m0, m1, m2, m3 + REPX {paddd x, m5 }, m0, m1, m2, m3 +%endif + REPX {psrad x, 12 }, m0, m1, m2, m3 + ret + +.shift_transpose: + mova m0, [r3+0*16] + mova m1, [r3+1*16] + mova m2, [r3+2*16] + mova m3, [r3+3*16] + mova m4, [r3+4*16] + mova m5, [r3+5*16] + mova m6, [r3+6*16] + mova m7, [r3+7*16] + REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + packssdw m6, m7 + call m(idct_8x4_internal_16bpc).transpose4x8packed + mova [t2+0*16+r5*8], m0 + mova [t2+8*16+r5*8], m2 + mova [t2+0*16+t0*8], m3 + mova [t2+0*16+t1*8], m1 + sub t2, 16*32 + sub r3, 8*16 + cmp r3, rsp + jg .shift_transpose + ret + +.dconly: + imul r5d, [cq], 181 + mov [cq], eobd ; 0 + mov r3d, 32 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 + add r5d, 384 + sar r5d, 9 + add rsp, (1+8*32+1*WIN64)*16 + jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly2 + +cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 16, \ + 0-(64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16-(4+4*ARCH_X86_32)*gprsize, \ + dst, stride, c, eob + LEA r6, base + test 
eobd, eobd + jz .dconly + +%if ARCH_X86_32 + DECLARE_REG_TMP 4, 1, 2, 0, 6 + mov [rsp+gprsize*1+(64*9+8)*16], r0 + mov [rsp+gprsize*2+(64*9+8)*16], r1 + mov [rsp+gprsize*3+(64*9+8)*16], r2 + mov [rsp+gprsize*4+(64*9+8)*16], r6 +%else + DECLARE_REG_TMP 8, 9, 4, 7, 0 + mov [rsp+gprsize*1+(64*9+1)*16], r9 + mov [rsp+gprsize*0+64*16], r0 +%if WIN64 + mov [rsp+gprsize*2+(64*9+1)*16], r7 + mov [rsp+gprsize*3+(64*9+1)*16], r8 +%endif +%endif +%undef cmp + + ; remove entirely-zero iterations + mov r5d, 14 + cmp eobw, word [o2(tbl_32x32_2d)+r5] + jge .end_zero_loop + pxor m0, m0 +.zero_loop: + movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] + movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] + movzx t0d, t1b + movzx t2d, t3b + shr t1d, 8 + shr t3d, 8 + lea t4, [rsp+7*64*16] +.zero_loop_inner: + mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t0*8], m0 + mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t1*8], m0 + mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t2*8], m0 + mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t3*8], m0 + sub t4, 64*16 + cmp t4, rsp + jge .zero_loop_inner +%if ARCH_X86_32 + mov r6, [rsp+gprsize*4+(64*9+8)*16] +%endif + sub r5d, 2 + cmp eobw, word [o2(tbl_32x32_2d)+r5] + jl .zero_loop +.end_zero_loop: + mov [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16], eobd +%if ARCH_X86_32 + mov cq, [rsp+gprsize*3+(64*9+8)*16] +%endif + ; actual first pass after skipping all-zero data +.loop_pass1: +%if ARCH_X86_64 + mova m11, [o(pd_2048)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] + mova m14, [o(pd_2896)] +%endif + + mov r3, rsp + lea r4, [o(idct64_mul_16bpc)] + mova m0, [cq+128* 1+r5*8] + mova m1, [cq+128*31+r5*8] + mova m2, [cq+128*17+r5*8] + mova m3, [cq+128*15+r5*8] + call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 + mova m0, [cq+128* 7+r5*8] + mova m1, [cq+128*25+r5*8] + mova m2, [cq+128*23+r5*8] + mova m3, [cq+128* 9+r5*8] + call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 + mova m0, [cq+128* 5+r5*8] + mova m1, [cq+128*27+r5*8] + mova m2, [cq+128*21+r5*8] + mova m3, [cq+128*11+r5*8] + call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 + mova m0, [cq+128* 3+r5*8] + mova m1, [cq+128*29+r5*8] + mova m2, [cq+128*19+r5*8] + mova m3, [cq+128*13+r5*8] + call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 + call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2 + + mova m0, [cq+128* 2+r5*8] + mova m1, [cq+128*14+r5*8] + mova m2, [cq+128*18+r5*8] + mova m3, [cq+128*30+r5*8] + call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast + + mova m0, [cq+128* 6+r5*8] + mova m1, [cq+128*10+r5*8] + mova m2, [cq+128*22+r5*8] + mova m3, [cq+128*26+r5*8] + call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast + add r3, 16*(24+4*ARCH_X86_32) + + mova m0, [cq+128* 4+r5*8] + mova m1, [cq+128*12+r5*8] + mova m2, [cq+128*20+r5*8] + mova m3, [cq+128*28+r5*8] + call m(idct_16x4_internal_16bpc).main_oddhalf_fast + + mova m0, [cq+128* 0+r5*8] + mova m1, [cq+128* 8+r5*8] + mova m2, [cq+128*16+r5*8] + mova m3, [cq+128*24+r5*8] + call m(idct_8x4_internal_16bpc).main_pass1_fast + call m(idct_8x4_internal_16bpc).round + mova [r3-(7+4*ARCH_X86_32)*16], m1 + mova [r3-(6+4*ARCH_X86_32)*16], m2 + mova [r3-(5+4*ARCH_X86_32)*16], m3 + mova [r3-(4+4*ARCH_X86_32)*16], m4 + mova [r3-(3+4*ARCH_X86_32)*16], m5 + mova [r3-(2+4*ARCH_X86_32)*16], m6 + mova [r3-(1+4*ARCH_X86_32)*16], m7 + sub r3, 16*(40+4*ARCH_X86_32-4) + +%if ARCH_X86_64 + psrld m15, m11, 10 ; pd_2 +%else + mova m7, [o(pd_2)] +%endif + call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start + + lea r3, [rsp+56*16] + movzx t1d, word 
[o2(tbl_Nx64_offset)+r5*2+0] + movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] + movzx t0d, t1b + movzx t2d, t3b + shr t1d, 8 + shr t3d, 8 + lea t4, [rsp+7*64*16+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16] + call .shift_transpose + ; zero cq + pxor m7, m7 +%if ARCH_X86_32 + mov cq, [rsp+gprsize*3+(64*9+8)*16] +%endif + lea r4, [cq+30*128+r5*8] +.zero_cq_loop: + REPX {mova [r4+x*128], m7}, -2, -1, 0, 1 + sub r4, 4*128 + cmp r4, cq + jg .zero_cq_loop +%if ARCH_X86_32 + mov r6, [rsp+gprsize*4+(64*9+8)*16] +%endif + sub r5d, 2 + jge .loop_pass1 + + ; pass=2 code starts here + mov eobd, [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16] +%if ARCH_X86_32 + mov strideq, [rsp+gprsize*2+(9*64+8)*16] +%else + mov r0, [rsp+gprsize*0+64*16] +%endif + add rsp, (64+8*ARCH_X86_32+1*ARCH_X86_64-3)*16 + cmp eobd, 151 + jl .fast + ; fall-through +%if ARCH_X86_64 + DECLARE_REG_TMP 8, 9 +%else + DECLARE_REG_TMP 1, 5 +%endif + lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] + lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)] + jmp .run +.fast: + lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] + lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)] +.run: + +%if ARCH_X86_64 + lea r2, [dstq+128] + mov r7, -16 +%else + lea r2, [rsp+(64*8+3)*16] + mov [r2+4*gprsize], t0 + mov [r2+5*gprsize], t1 + mov r1, [r2+2*gprsize] + mov dword [r2+0*gprsize], 8 +%endif + jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2 + + ; copy of pass=1 tmp-regs +%if ARCH_X86_32 + DECLARE_REG_TMP 4, 1, 2, 0, 6 +%else + DECLARE_REG_TMP 8, 9, 4, 7, 0 +%endif + +.shift_transpose: + mova m0, [r3+0*16] + mova m1, [r3+1*16] + mova m2, [r3+2*16] + mova m3, [r3+3*16] + mova m4, [r3+4*16] + mova m5, [r3+5*16] + mova m6, [r3+6*16] + mova m7, [r3+7*16] + REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7 + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + packssdw m6, m7 + call m(idct_8x4_internal_16bpc).transpose4x8packed + mova [t4+t0*8], m0 + mova [t4+t1*8], m1 + mova [t4+t2*8], m2 + mova [t4+t3*8], m3 + sub t4, 16*64 + sub r3, 8*16 + cmp r3, rsp + jg .shift_transpose + ret + +.dconly: + imul r5d, [cq], 181 + mov [cq], eobd ; 0 + mov r3d, 64 + add rsp, (64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16 + \ + (4+4*ARCH_X86_32)*gprsize - (64+8*ARCH_X86_32)*16 + jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly1 diff --git a/third_party/dav1d/src/x86/itx_avx2.asm b/third_party/dav1d/src/x86/itx_avx2.asm new file mode 100644 index 0000000000..a67f053a61 --- /dev/null +++ b/third_party/dav1d/src/x86/itx_avx2.asm @@ -0,0 +1,5542 @@ +; Copyright © 2018-2021, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 16 + +; Note: The order of (at least some of) those constants matters! + +const deint_shuf, db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 + +%macro COEF_PAIR 2 +pw_%1_%2: dw %1, %2 +pw_m%2_%1: dw -%2, %1 +%endmacro + +; ADST-only +pw_3803_1321: dw 3803, 1321 +pw_m1321_2482: dw -1321, 2482 +pw_2482_3344: dw 2482, 3344 +pw_m3344_3344: dw -3344, 3344 +pw_m3803_3344: dw -3803, 3344 +pw_m3803_m6688: dw -3803, -6688 +pw_2896_m2896: dw 2896, -2896 + +const pw_5, times 2 dw 5 +const pw_2048, times 2 dw 2048 +const pw_4096, times 2 dw 4096 +const pw_8192, times 2 dw 8192 +const pw_16384, times 2 dw 16384 +const pw_1697x16, times 2 dw 1697*16 +const pw_1697x8, times 2 dw 1697*8 +const pw_2896x8, times 2 dw 2896*8 +const pd_2048, dd 2048 + +const pw_2896_2896, dw 2896, 2896 +const pw_m2896_2896, dw -2896, 2896 +const pw_1567_3784, dw 1567, 3784 +const pw_m3784_1567, dw -3784, 1567 +COEF_PAIR 3784, 1567 +COEF_PAIR 201, 4091 +COEF_PAIR 995, 3973 +COEF_PAIR 1751, 3703 +COEF_PAIR 2440, 3290 +COEF_PAIR 3035, 2751 +COEF_PAIR 3513, 2106 +COEF_PAIR 3857, 1380 +COEF_PAIR 4052, 601 +COEF_PAIR 401, 4076 +COEF_PAIR 1931, 3612 +COEF_PAIR 3166, 2598 +COEF_PAIR 3920, 1189 +COEF_PAIR 799, 4017 +COEF_PAIR 3406, 2276 +pw_m799_m4017: dw -799, -4017 +const pw_m1567_m3784, dw -1567, -3784 +pw_m3406_m2276: dw -3406, -2276 +pw_m401_m4076: dw -401, -4076 +pw_m3166_m2598: dw -3166, -2598 +pw_m1931_m3612: dw -1931, -3612 +pw_m3920_m1189: dw -3920, -1189 +COEF_PAIR 2276, 3406 +COEF_PAIR 4017, 799 + +%macro COEF_X8 1-* +%rep %0 + dw %1*8, %1*8 + %rotate 1 +%endrep +%endmacro + +pw_3703x8: COEF_X8 3703 +pw_1751x8: COEF_X8 1751 +pw_m1380x8: COEF_X8 -1380 +pw_3857x8: COEF_X8 3857 +pw_3973x8: COEF_X8 3973 +pw_995x8: COEF_X8 995 +pw_m2106x8: COEF_X8 -2106 +pw_3513x8: COEF_X8 3513 +pw_3290x8: COEF_X8 3290 +pw_2440x8: COEF_X8 2440 +pw_m601x8: COEF_X8 -601 +pw_4052x8: COEF_X8 4052 + +const idct64_mul +COEF_X8 4095, 101, 4065, 501, 2967, -2824, 3229, -2520 +COEF_X8 3745, 1660, 3564, 2019, 3822, -1474, 3948, -1092 +COEF_X8 3996, 897, 3889, 1285, 3461, -2191, 3659, -1842 +COEF_X8 3349, 2359, 3102, 2675, 4036, -700, 4085, -301 + +pw_201_4091x8: dw 201*8, 4091*8 +pw_m601_4052x8: dw -601*8, 4052*8 +pw_995_3973x8: dw 995*8, 3973*8 +pw_m1380_3857x8: dw -1380*8, 3857*8 +pw_1751_3703x8: dw 1751*8, 3703*8 +pw_m2106_3513x8: dw -2106*8, 3513*8 +pw_2440_3290x8: dw 2440*8, 3290*8 +pw_m2751_3035x8: dw -2751*8, 3035*8 + +%define o_idct64_offset idct64_mul - (o_base) - 8 + +SECTION .text + +; Code size reduction trickery: Instead of using rip-relative loads with +; mandatory 4-byte offsets everywhere, we can set up a base pointer with a +; single rip-relative lea and then address things relative from that with +; 1-byte offsets as long as data is within +-128 bytes of the base pointer.
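+; As a minimal worked sketch of the o() define below (not part of the
+; upstream commit): after the per-function setup
+;   lea r6, [o_base]               ; o_base = deint_shuf + 128
+; a constant load such as
+;   vpbroadcastd m4, [o(pw_2048)]  ; = [r6 + (pw_2048 - deint_shuf - 128)]
+; folds the symbol difference into a small constant and encodes as
+; [r6 + disp8], since pw_2048 lies within 256 bytes of deint_shuf,
+; whereas a plain [rel pw_2048] would always require a 4-byte disp32.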
+%define o_base deint_shuf + 128 +%define o(x) (r6 - (o_base) + (x)) +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +; flags: 1 = swap, 2 = interleave, 4: coef_regs +%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags +%if %7 & 4 + pmaddwd m%2, m%5, m%1 + pmaddwd m%1, m%6 +%else +%if %7 & 1 + vpbroadcastd m%2, [o(pw_%5_%6)] + vpbroadcastd m%3, [o(pw_m%6_%5)] +%else + vpbroadcastd m%2, [o(pw_m%6_%5)] + vpbroadcastd m%3, [o(pw_%5_%6)] +%endif + pmaddwd m%2, m%1 + pmaddwd m%1, m%3 +%endif + paddd m%2, m%4 + paddd m%1, m%4 +%if %7 & 2 + pslld m%2, 4 + psrld m%1, 12 + pblendw m%1, m%2, 0xaa +%else + psrad m%2, 12 + psrad m%1, 12 + packssdw m%1, m%2 +%endif +%endmacro + +; flags: 1 = swap, 2 = interleave, 4 = coef_regs +%macro ITX_MUL4X_PACK 9-10 0 ; dst/src, tmp[1-3], rnd, coef[1-4], flags +%if %10 & 1 + vpbroadcastd m%3, [o(pw_%8_%9)] + vpbroadcastd m%4, [o(pw_m%9_%8)] + vpbroadcastd xm%2, [o(pw_%6_%7)] + vpblendd m%2, m%3, 0xf0 + vpbroadcastd xm%3, [o(pw_m%7_%6)] +%else + vpbroadcastd m%3, [o(pw_m%9_%8)] + vpbroadcastd m%4, [o(pw_%8_%9)] + vpbroadcastd xm%2, [o(pw_m%7_%6)] + vpblendd m%2, m%3, 0xf0 + vpbroadcastd xm%3, [o(pw_%6_%7)] +%endif + vpblendd m%3, m%4, 0xf0 + ITX_MUL2X_PACK %1, %4, _, %5, %2, %3, (4|%10) +%endmacro + +; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 +; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 +%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2 + punpckhwd m%3, m%2, m%1 + punpcklwd m%2, m%1 +%if %7 < 32 + pmaddwd m%1, m%7, m%2 + pmaddwd m%4, m%7, m%3 +%else + vpbroadcastd m%1, [o(pw_m%7_%6)] + pmaddwd m%4, m%3, m%1 + pmaddwd m%1, m%2 +%endif + paddd m%4, m%5 + paddd m%1, m%5 + psrad m%4, 12 + psrad m%1, 12 + packssdw m%1, m%4 +%if %7 < 32 + pmaddwd m%3, m%6 + pmaddwd m%2, m%6 +%else + vpbroadcastd m%4, [o(pw_%6_%7)] + pmaddwd m%3, m%4 + pmaddwd m%2, m%4 +%endif + paddd m%3, m%5 + paddd m%2, m%5 + psrad m%3, 12 + psrad m%2, 12 +%if %0 == 8 + packssdw m%8, m%2, m%3 +%else + packssdw m%2, m%3 +%endif +%endmacro + +%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048 + ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, %5 ; t2, t3 + ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, %4 ; t1, t0 + psubsw m%3, m%1, m%2 + paddsw m%2, m%1 + paddsw m%1, m%4, m%5 + psubsw m%4, m%5 +%endmacro + +%macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048 + ITX_MULSUB_2W %6, %4, %9, %10, %11, 3406, 2276 ; t5a, t6a + ITX_MULSUB_2W %2, %8, %9, %10, %11, 799, 4017 ; t4a, t7a + ITX_MULSUB_2W %3, %7, %9, %10, %11, 1567, 3784 ; t2, t3 + paddsw m%9, m%2, m%6 ; t4 + psubsw m%2, m%6 ; t5a + paddsw m%10, m%8, m%4 ; t7 + psubsw m%8, m%4 ; t6a + ITX_MULSUB_2W %1, %5, %4, %6, %11, 2896, 2896 ; t1, t0 + ITX_MULSUB_2W %8, %2, %4, %6, %11, 2896, 2896 ; t5, t6 + psubsw m%6, m%1, m%3 ; dct4 out2 + paddsw m%3, m%1 ; dct4 out1 + paddsw m%1, m%5, m%7 ; dct4 out0 + psubsw m%5, m%7 ; dct4 out3 + psubsw m%7, m%3, m%2 ; out6 + paddsw m%2, m%3 ; out1 + paddsw m%3, m%6, m%8 ; out2 + psubsw m%6, m%8 ; out5 + psubsw m%8, m%1, m%10 ; out7 + paddsw m%1, m%10 ; out0 + paddsw m%4, m%5, m%9 ; out3 + psubsw m%5, m%9 ; out4 +%endmacro + +; in1 = %1, in3 = %2, in5 = %3, in7 = %4 +; in9 = %5, in11 = %6, in13 = %7, in15 = %8 +%macro IDCT16_1D_ODDHALF 11 ; src[1-8], tmp[1-2], pd_2048 + ITX_MULSUB_2W %1, %8, %9, %10, %11, 401, 4076 ; t8a, t15a + ITX_MULSUB_2W %5, %4, %9, %10, %11, 3166, 2598 ; t9a, t14a + ITX_MULSUB_2W %3, %6, %9, %10, %11, 1931, 3612 ; t10a, t13a + ITX_MULSUB_2W %7, %2, %9, %10, %11, 3920, 1189 ; t11a, t12a + psubsw m%9, m%2, m%6 ; t13 + paddsw m%6, m%2 ; t12 + 
psubsw m%2, m%8, m%4 ; t14 + paddsw m%8, m%4 ; t15 + psubsw m%4, m%7, m%3 ; t10 + paddsw m%3, m%7 ; t11 + psubsw m%7, m%1, m%5 ; t9 + paddsw m%1, m%5 ; t8 + ITX_MULSUB_2W %2, %7, %5, %10, %11, 1567, 3784 ; t9a, t14a + ITX_MULSUB_2W %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a + psubsw m%5, m%1, m%3 ; t11a + paddsw m%1, m%3 ; t8a + psubsw m%3, m%7, m%4 ; t13 + paddsw m%7, m%4 ; t14 + psubsw m%4, m%8, m%6 ; t12a + paddsw m%8, m%6 ; t15a + psubsw m%6, m%2, m%9 ; t10 + paddsw m%2, m%9 ; t9 + ITX_MULSUB_2W %3, %6, %9, %10, %11, 2896, 2896 ; t10a, t13a + ITX_MULSUB_2W %4, %5, %9, %10, %11, 2896, 2896 ; t11, t12 +%endmacro + +%macro WRAP_XMM 1+ + INIT_XMM cpuname + %1 + INIT_YMM cpuname +%endmacro + +%macro ITX4_END 4-5 2048 ; row[1-4], rnd +%if %5 + vpbroadcastd m2, [o(pw_%5)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 +%endif + lea r2, [dstq+strideq*2] +%assign %%i 1 +%rep 4 + %if %1 & 2 + CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1) + %else + CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1) + %endif + %assign %%i %%i + 1 + %rotate 1 +%endrep + movd m2, [%%row_adr1] + pinsrd m2, [%%row_adr2], 1 + movd m3, [%%row_adr3] + pinsrd m3, [%%row_adr4], 1 + pmovzxbw m2, m2 + pmovzxbw m3, m3 + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + movd [%%row_adr1], m0 + pextrd [%%row_adr2], m0, 1 + pextrd [%%row_adr3], m0, 2 + pextrd [%%row_adr4], m0, 3 + ret +%endmacro + +%macro IWHT4_1D_PACKED 0 + punpckhqdq m3, m0, m1 ; in1 in3 + punpcklqdq m0, m1 ; in0 in2 + psubw m2, m0, m3 + paddw m0, m3 + punpckhqdq m2, m2 ; t2 t2 + punpcklqdq m0, m0 ; t0 t0 + psubw m1, m0, m2 + psraw m1, 1 + psubw m1, m3 ; t1 t3 + psubw m0, m1 ; ____ out0 + paddw m2, m1 ; out3 ____ +%endmacro + +INIT_XMM avx2 +cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, c + mova m0, [cq+16*0] + mova m1, [cq+16*1] + pxor m2, m2 + mova [cq+16*0], m2 + mova [cq+16*1], m2 + psraw m0, 2 + psraw m1, 2 + IWHT4_1D_PACKED + punpckhwd m0, m1 + punpcklwd m3, m1, m2 + punpckhdq m1, m0, m3 + punpckldq m0, m3 + IWHT4_1D_PACKED + vpblendd m0, m2, 0x03 + ITX4_END 3, 0, 2, 1, 0 + +%macro INV_TXFM_FN 3 ; type1, type2, size +cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 5, 0, dst, stride, c, eob, tx2 + %define %%p1 m(i%1_%3_internal_8bpc) + lea r6, [o_base] + ; Jump to the 1st txfm function if we're not taking the fast path, which + ; in turn performs an indirect jump to the 2nd txfm function. 
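+ ; (e.g. for dct_adst, %%p1 is idct_%3_internal_8bpc and tx2q is loaded
+ ; below with iadst_%3_internal_8bpc.pass2; every pass-1 body ends in
+ ; "jmp tx2q" rather than a ret, so the two passes chain directly.)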
+ lea tx2q, [m(i%2_%3_internal_8bpc).pass2] +%ifidn %1_%2, dct_dct + test eobd, eobd + jnz %%p1 +%else + ; jump to the 1st txfm function unless it's located directly after this + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endmacro + +%macro INV_TXFM_4X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x4 +%ifidn %1_%2, dct_dct + vpbroadcastw m0, [cq] + vpbroadcastd m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + mov [cq], eobd ; 0 + pmulhrsw m0, m1 + mova m1, m0 + jmp m(iadst_4x4_internal_8bpc).end2 +%endif +%endmacro + +%macro IDCT4_1D_PACKED 0 + vpbroadcastd m4, [o(pd_2048)] + punpckhwd m2, m1, m0 + punpcklwd m1, m0 + ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784 + ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896 + paddsw m0, m1, m2 ; out0 out1 + psubsw m1, m2 ; out3 out2 +%endmacro + +%macro IADST4_1D_PACKED 0 + punpcklwd m2, m1, m0 + punpckhwd m3, m1, m0 + vpbroadcastd m5, [o(pw_m3344_3344)] + vpbroadcastd m0, [o(pw_3803_1321)] + vpbroadcastd m4, [o(pw_m1321_2482)] + pmaddwd m1, m5, m2 ; 3344*in3 - 3344*in2 + psrld m5, 16 + pmaddwd m0, m2 + pmaddwd m2, m4 + pmaddwd m5, m3 ; 3344*in0 + paddd m1, m5 ; 3344*in0 - 3344*in2 + 3344*in3 + vpbroadcastd m4, [o(pw_2482_3344)] + vpbroadcastd m5, [o(pw_m3803_3344)] + pmaddwd m4, m3 + pmaddwd m5, m3 + paddd m4, m0 ; 1321*in0 + 3344*in1 + 3803*in2 + 2482*in3 + vpbroadcastd m0, [o(pw_m3803_m6688)] + pmaddwd m3, m0 + vpbroadcastd m0, [o(pd_2048)] + paddd m2, m0 + paddd m1, m0 + paddd m0, m4 + paddd m5, m2 ; 2482*in0 + 3344*in1 - 1321*in2 - 3803*in3 + paddd m2, m4 + paddd m2, m3 + REPX {psrad x, 12}, m1, m2, m0, m5 + packssdw m0, m5 ; out0 out1 + packssdw m1, m2 ; out2 out3 +%endmacro + +INV_TXFM_4X4_FN dct, dct +INV_TXFM_4X4_FN dct, adst +INV_TXFM_4X4_FN dct, flipadst +INV_TXFM_4X4_FN dct, identity + +cglobal idct_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + IDCT4_1D_PACKED + mova m2, [o(deint_shuf)] + shufps m3, m0, m1, q1331 + shufps m0, m1, q0220 + pshufb m0, m2 + pshufb m1, m3, m2 + jmp tx2q +.pass2: + IDCT4_1D_PACKED + pxor m2, m2 + mova [cq+16*0], m2 + mova [cq+16*1], m2 + ITX4_END 0, 1, 3, 2 + +INV_TXFM_4X4_FN adst, dct +INV_TXFM_4X4_FN adst, adst +INV_TXFM_4X4_FN adst, flipadst +INV_TXFM_4X4_FN adst, identity + +cglobal iadst_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + call .main + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + call .main +.end: + pxor m2, m2 + mova [cq+16*0], m2 + mova [cq+16*1], m2 +.end2: + ITX4_END 0, 1, 2, 3 +ALIGN function_align +cglobal_label .main + IADST4_1D_PACKED + ret + +INV_TXFM_4X4_FN flipadst, dct +INV_TXFM_4X4_FN flipadst, adst +INV_TXFM_4X4_FN flipadst, flipadst +INV_TXFM_4X4_FN flipadst, identity + +cglobal iflipadst_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + call m(iadst_4x4_internal_8bpc).main + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + jmp tx2q +.pass2: + call m(iadst_4x4_internal_8bpc).main +.end: + pxor m2, m2 + mova [cq+16*0], m2 + mova [cq+16*1], m2 +.end2: + ITX4_END 3, 2, 1, 0 + +INV_TXFM_4X4_FN identity, dct +INV_TXFM_4X4_FN identity, adst +INV_TXFM_4X4_FN identity, flipadst +INV_TXFM_4X4_FN identity, identity + +cglobal iidentity_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + vpbroadcastd m3, [o(pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + punpckhwd 
m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + jmp tx2q +.pass2: + vpbroadcastd m3, [o(pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + jmp m(iadst_4x4_internal_8bpc).end + +%macro WRITE_4X8 2 ; coefs[1-2] + movd xm4, [dstq+strideq*0] + pinsrd xm4, [dstq+strideq*1], 1 + movd xm5, [dstq+strideq*2] + pinsrd xm5, [dstq+r3 ], 1 + pinsrd xm4, [r2 +strideq*0], 2 + pinsrd xm4, [r2 +strideq*1], 3 + pinsrd xm5, [r2 +strideq*2], 2 + pinsrd xm5, [r2 +r3 ], 3 + pmovzxbw m4, xm4 + pmovzxbw m5, xm5 + paddw m4, m%1 + paddw m5, m%2 + packuswb m4, m5 + vextracti128 xm5, m4, 1 + movd [dstq+strideq*0], xm4 + pextrd [dstq+strideq*1], xm4, 1 + pextrd [dstq+strideq*2], xm4, 2 + pextrd [dstq+r3 ], xm4, 3 + movd [r2 +strideq*0], xm5 + pextrd [r2 +strideq*1], xm5, 1 + pextrd [r2 +strideq*2], xm5, 2 + pextrd [r2 +r3 ], xm5, 3 +%endmacro + +%macro INV_TXFM_4X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x8 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_2048)] + mov [cq], eobd + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + mova m1, m0 + jmp m(iadst_4x8_internal_8bpc).end3 +%endif +%endmacro + +%macro IDCT8_1D_PACKED 0 + vpbroadcastd m6, [o(pd_2048)] + punpckhwd m5, m3, m0 ; in7 in1 + punpckhwd m4, m1, m2 ; in3 in5 + punpcklwd m3, m1 ; in6 in2 + punpcklwd m2, m0 ; in4 in0 + ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a + ITX_MUL2X_PACK 4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a + ITX_MUL2X_PACK 3, 0, 1, 6, 1567, 3784 ; t3 t2 + psubsw m0, m5, m4 ; t5a t6a (interleaved) + paddsw m4, m5 ; t4 t7 (interleaved) + ITX_MUL2X_PACK 2, 1, 5, 6, 2896, 2896 ; t0 t1 + vpbroadcastd m1, [o(pw_m2896_2896)] + ITX_MUL2X_PACK 0, 1, _, 6, 1, 5, 4 ; t6 t5 +%if mmsize > 16 + vbroadcasti128 m1, [o(deint_shuf)] + pshufb m4, m1 +%else + pshufb m4, [o(deint_shuf)] +%endif + psubsw m1, m2, m3 ; tmp3 tmp2 + paddsw m3, m2 ; tmp0 tmp1 + shufps m2, m4, m0, q1032 ; t7 t6 + vpblendd m4, m0, 0xcc ; t4 t5 + paddsw m0, m3, m2 ; out0 out1 + psubsw m3, m2 ; out7 out6 + psubsw m2, m1, m4 ; out4 out5 + paddsw m1, m4 ; out3 out2 +%endmacro + +%macro IADST8_1D_PACKED 1 ; pass + vpbroadcastd m6, [o(pd_2048)] + punpckhwd m0, m4, m3 ; 0 7 + punpckhwd m1, m5, m2 ; 2 5 + punpcklwd m2, m5 ; 4 3 + punpcklwd m3, m4 ; 6 1 +%if %1 == 1 + ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a + ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a + ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a + ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a + psubsw m4, m0, m2 ; t5 t4 + paddsw m0, m2 ; t1 t0 + psubsw m5, m1, m3 ; t6 t7 + paddsw m1, m3 ; t2 t3 + ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a + ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a +%if mmsize > 16 + vbroadcasti128 m2, [o(deint_shuf)] +%else + mova m2, [o(deint_shuf)] +%endif + pshuflw m1, m1, q2301 + pshufhw m1, m1, q2301 + psubsw m3, m0, m1 ; t3 t2 + paddsw m0, m1 ; -out7 out0 + psubsw m1, m4, m5 ; t7 t6 + paddsw m4, m5 ; out6 -out1 + pshufb m0, m2 + pshufb m4, m2 + vpbroadcastd m5, [o(pw_m2896_2896)] + pmaddwd m2, m5, m3 + pmaddwd m5, m1 + paddd m2, m6 + paddd m5, m6 + psrad m2, 12 + psrad m5, 12 + packssdw m2, m5 ; out4 -out5 + vpbroadcastd m5, [o(pw_2896_2896)] + pmaddwd m3, m5 + pmaddwd m1, m5 + paddd m3, m6 + paddd m1, m6 + psrad m3, 12 + psrad m1, 12 + packssdw m1, m3 ; out2 -out3 + punpcklqdq m3, m4, m0 ; out6 -out7 + punpckhqdq m0, m4 ; out0 -out1 +%else + ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a + ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a 
+ ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a + ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a + psubsw m4, m0, m2 ; t4 t5 + paddsw m0, m2 ; t0 t1 + psubsw m5, m1, m3 ; t6 t7 + paddsw m1, m3 ; t2 t3 + shufps m2, m5, m4, q1032 + punpckhwd m4, m2 + punpcklwd m5, m2 + ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 1 ; t5a t4a + ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567 ; t7a t6a + psubsw m2, m0, m1 ; t2 t3 + paddsw m0, m1 ; out0 -out7 + psubsw m1, m4, m5 ; t7 t6 + paddsw m4, m5 ; out6 -out1 + vpbroadcastd m5, [o(pw_2896x8)] + vpblendd m3, m0, m4, 0x33 ; out6 -out7 + vpblendd m0, m4, 0xcc ; out0 -out1 + shufps m4, m2, m1, q1032 ; t3 t7 + vpblendd m1, m2, 0x33 ; t2 t6 + psubsw m2, m1, m4 ; t2-t3 t6-t7 + paddsw m1, m4 ; t2+t3 t6+t7 + pmulhrsw m2, m5 ; out4 -out5 + pshufd m1, m1, q1032 + pmulhrsw m1, m5 ; out2 -out3 +%endif +%endmacro + +INIT_YMM avx2 +INV_TXFM_4X8_FN dct, dct +INV_TXFM_4X8_FN dct, adst +INV_TXFM_4X8_FN dct, flipadst +INV_TXFM_4X8_FN dct, identity + +cglobal idct_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m2, [o(pw_2896x8)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + IDCT4_1D_PACKED + vbroadcasti128 m2, [o(deint_shuf)] + shufps m3, m0, m1, q1331 + shufps m0, m1, q0220 + pshufb m0, m2 + pshufb m1, m3, m2 + jmp tx2q +.pass2: + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + call .main + vpbroadcastd m4, [o(pw_2048)] + vinserti128 m0, xm2, 1 + vinserti128 m1, xm3, 1 + pshufd m1, m1, q1032 + jmp m(iadst_4x8_internal_8bpc).end2 +ALIGN function_align +cglobal_label .main + WRAP_XMM IDCT8_1D_PACKED + ret + +INV_TXFM_4X8_FN adst, dct +INV_TXFM_4X8_FN adst, adst +INV_TXFM_4X8_FN adst, flipadst +INV_TXFM_4X8_FN adst, identity + +cglobal iadst_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m2, [o(pw_2896x8)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + call m(iadst_8x4_internal_8bpc).main + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + pshufd xm4, xm0, q1032 + pshufd xm5, xm1, q1032 + call .main_pass2 + vpbroadcastd m4, [o(pw_2048)] + vinserti128 m0, xm2, 1 + vinserti128 m1, xm3, 1 + pxor m5, m5 + psubw m5, m4 +.end: + vpblendd m4, m5, 0xcc +.end2: + pmulhrsw m0, m4 + pmulhrsw m1, m4 + WIN64_RESTORE_XMM + pxor m2, m2 + mova [cq+32*0], m2 + mova [cq+32*1], m2 +.end3: + lea r2, [dstq+strideq*4] + lea r3, [strideq*3] + WRITE_4X8 0, 1 + RET +ALIGN function_align +.main_pass1: + WRAP_XMM IADST8_1D_PACKED 1 + ret +ALIGN function_align +cglobal_label .main_pass2 + WRAP_XMM IADST8_1D_PACKED 2 + ret + +INV_TXFM_4X8_FN flipadst, dct +INV_TXFM_4X8_FN flipadst, adst +INV_TXFM_4X8_FN flipadst, flipadst +INV_TXFM_4X8_FN flipadst, identity + +cglobal iflipadst_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m2, [o(pw_2896x8)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + call m(iadst_8x4_internal_8bpc).main + punpcklwd m3, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m3 + punpckhwd m1, m3 + jmp tx2q +.pass2: + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + pshufd xm4, xm0, q1032 + pshufd xm5, xm1, q1032 + call m(iadst_4x8_internal_8bpc).main_pass2 + vpbroadcastd m5, [o(pw_2048)] + vinserti128 m3, xm1, 1 + vinserti128 m2, xm0, 1 + pxor m4, m4 + psubw m4, m5 + pshufd m0, m3, q1032 + pshufd m1, m2, q1032 + jmp m(iadst_4x8_internal_8bpc).end + +INV_TXFM_4X8_FN identity, dct 
+INV_TXFM_4X8_FN identity, adst +INV_TXFM_4X8_FN identity, flipadst +INV_TXFM_4X8_FN identity, identity + +cglobal iidentity_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m2, [cq+32*0], q3120 + vpermq m0, [cq+32*1], q3120 + vpbroadcastd m3, [o(pw_2896x8)] + vpbroadcastd m4, [o(pw_1697x8)] + punpcklwd m1, m2, m0 + punpckhwd m2, m0 + pmulhrsw m1, m3 + pmulhrsw m2, m3 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + pmulhrsw m2, m4, m0 + pmulhrsw m4, m1 + paddsw m0, m2 + paddsw m1, m4 + jmp tx2q +.pass2: + vpbroadcastd m4, [o(pw_4096)] + jmp m(iadst_4x8_internal_8bpc).end2 + +%macro INV_TXFM_4X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x16 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + movd xm3, [o(pw_2048)] + mov [cq], eobd + pmulhrsw xm0, xm2 + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm3 + vpbroadcastw m0, xm0 + mova m1, m0 + mova m2, m0 + mova m3, m0 + jmp m(iadst_4x16_internal_8bpc).end3 +%endif +%endmacro + +%macro IDCT16_1D_PACKED 0 + vpbroadcastd m10, [o(pd_2048)] +.main2: + punpckhwd m8, m7, m0 ; dct16 in15 in1 + punpcklwd m9, m4, m0 ; dct4 in2 in0 + punpckhwd m0, m3, m4 ; dct16 in7 in9 + punpcklwd m7, m1 ; dct8 in7 in1 + punpckhwd m1, m6 ; dct16 in3 in13 + punpcklwd m3, m5 ; dct8 in3 in5 + punpckhwd m5, m2 ; dct16 in11 in5 + punpcklwd m6, m2 ; dct4 in3 in1 + ITX_MUL2X_PACK 8, 2, 4, 10, 401, 4076, 3 ; t8a t15a + ITX_MUL2X_PACK 0, 2, 4, 10, 3166, 2598, 3 ; t9a t14a + ITX_MUL2X_PACK 1, 2, 4, 10, 3920, 1189, 3 ; t11a t12a + ITX_MUL2X_PACK 5, 2, 4, 10, 1931, 3612, 3 ; t10a t13a + ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 3 ; t4a t7a + ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 3 ; t5a t6a + ITX_MUL2X_PACK 6, 2, 4, 10, 1567, 3784 ; t3 t2 + psubsw m2, m8, m0 ; t9 t14 + paddsw m8, m0 ; t8 t15 + psubsw m0, m1, m5 ; t10 t13 + paddsw m1, m5 ; t11 t12 + vpbroadcastd m5, [o(pw_m3784_1567)] ; reuse pw_1567_3784 + ITX_MUL2X_PACK 2, 4, _, 10, 4, 5, 6 ; t9a t14a + vpbroadcastd m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567 + ITX_MUL2X_PACK 0, 5, _, 10, 5, 4, 6 ; t10a t13a + psubsw m4, m8, m1 ; t11a t12a + paddsw m8, m1 ; t8a t15a + psubsw m1, m7, m3 ; t5a t6a + paddsw m7, m3 ; t4 t7 + paddsw m3, m2, m0 ; t9 t14 + psubsw m2, m0 ; t10 t13 +%if mmsize > 16 + vbroadcasti128 m0, [o(deint_shuf)] +%else + mova m0, [o(deint_shuf)] +%endif + pshufb m8, m0 + pshufb m7, m0 + pshufb m3, m0 + ITX_MUL2X_PACK 9, 0, 5, 10, 2896, 2896 ; t0 t1 + vpbroadcastd m0, [o(pw_m2896_2896)] + ITX_MUL2X_PACK 4, 5, _, 10, 5, 0, 4 ; t11 t12 + vpbroadcastd m5, [o(pw_2896_2896)] + ITX_MUL2X_PACK 1, 0, _, 10, 0, 5, 4 ; t6 t5 + vpbroadcastd m0, [o(pw_m2896_2896)] + ITX_MUL2X_PACK 2, 0, _, 10, 0, 5, 4 ; t13a t10a + punpckhqdq m0, m8, m3 ; t15a t14 + punpcklqdq m8, m3 ; t8a t9 + shufps m5, m4, m2, q1032 ; t12 t13a + vpblendd m4, m2, 0xcc ; t11 t10a + shufps m2, m7, m1, q1032 ; t7 t6 + vpblendd m7, m1, 0xcc ; t4 t5 + psubsw m1, m9, m6 ; dct4 out3 out2 + paddsw m9, m6 ; dct4 out0 out1 + psubsw m3, m9, m2 ; dct8 out7 out6 + paddsw m9, m2 ; dct8 out0 out1 + psubsw m2, m1, m7 ; dct8 out4 out5 + paddsw m1, m7 ; dct8 out3 out2 + psubsw m7, m9, m0 ; out15 out14 + paddsw m0, m9 ; out0 out1 + psubsw m6, m1, m5 ; out12 out13 + paddsw m1, m5 ; out3 out2 + psubsw m5, m2, m4 ; out11 out10 + paddsw m2, m4 ; out4 out5 + psubsw m4, m3, m8 ; out8 out9 + paddsw m3, m8 ; out7 out6 +%endmacro + +INV_TXFM_4X16_FN dct, dct +INV_TXFM_4X16_FN dct, adst +INV_TXFM_4X16_FN dct, flipadst +INV_TXFM_4X16_FN dct, identity + +cglobal idct_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 + mova m0, 
[cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + call m(idct_16x4_internal_8bpc).main + vpbroadcastd m5, [o(pw_16384)] + punpckhwd m4, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + REPX {pmulhrsw x, m5}, m0, m4, m2, m3 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + vextracti128 xm4, m0, 1 + vextracti128 xm5, m1, 1 + vextracti128 xm6, m2, 1 + vextracti128 xm7, m3, 1 + call .main + vinserti128 m0, xm4, 1 + vinserti128 m1, xm5, 1 + vpbroadcastd m5, [o(pw_2048)] + vinserti128 m2, xm6, 1 + vinserti128 m3, xm7, 1 + pshufd m1, m1, q1032 + pshufd m3, m3, q1032 + jmp m(iadst_4x16_internal_8bpc).end2 +ALIGN function_align +cglobal_label .main + WRAP_XMM IDCT16_1D_PACKED + ret + +INV_TXFM_4X16_FN adst, dct +INV_TXFM_4X16_FN adst, adst +INV_TXFM_4X16_FN adst, flipadst +INV_TXFM_4X16_FN adst, identity + +cglobal iadst_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 + mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + call m(iadst_16x4_internal_8bpc).main + vpbroadcastd m5, [o(pw_16384)] + punpckhwd m4, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + REPX {pmulhrsw x, m5}, m4, m2, m3, m0 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + call .main + vpbroadcastd m5, [o(pw_2896x8)] + paddsw m1, m2, m4 + psubsw m2, m4 + pmulhrsw m1, m5 ; -out7 out4 out6 -out5 + pmulhrsw m2, m5 ; out8 -out11 -out9 out10 + vpbroadcastd m5, [o(pw_2048)] + pshufd m1, m1, q1032 + vpblendd m4, m1, m0, 0x33 + vpblendd m0, m2, 0x33 + vpblendd m2, m3, 0x33 + vpblendd m3, m1, 0x33 + vpermq m0, m0, q2031 + vpermq m1, m2, q1302 + vpermq m2, m3, q3120 + vpermq m3, m4, q0213 + psubw m6, m7, m5 +.end: + vpblendd m5, m6, 0xcc +.end2: + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + WIN64_RESTORE_XMM + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + mova [cq+32*2], m4 + mova [cq+32*3], m4 +.end3: + lea r2, [dstq+strideq*8] + lea r3, [strideq*3] + WRITE_4X8 0, 1 + lea dstq, [dstq+strideq*4] + lea r2, [r2 +strideq*4] + WRITE_4X8 2, 3 + RET +ALIGN function_align +.main: + vpblendd m4, m1, m0, 0xcc + vpblendd m1, m0, 0x33 + vpblendd m5, m2, m3, 0xcc + vpblendd m2, m3, 0x33 + vperm2i128 m3, m5, m2, 0x31 + vinserti128 m0, m1, xm4, 1 ; in0 in3 in2 in1 + vperm2i128 m4, m1, m4, 0x31 + vinserti128 m1, m5, xm2, 1 ; in4 in7 in6 in5 + pshufd m3, m3, q1032 ; in15 in12 in13 in14 + pshufd m2, m4, q1032 ; in11 in8 in9 in10 +cglobal_label .main2 + vpbroadcastd m8, [o(pd_2048)] + pxor m7, m7 + punpckhwd m4, m3, m0 ; in12 in3 in14 in1 + punpcklwd m0, m3 ; in0 in15 in2 in13 + punpckhwd m3, m2, m1 ; in8 in7 in10 in5 + punpcklwd m1, m2 ; in4 in11 in6 in9 + ITX_MUL4X_PACK 0, 2, 5, 6, 8, 201, 4091, 995, 3973, 3 + ITX_MUL4X_PACK 1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3 + ITX_MUL4X_PACK 3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3 + ITX_MUL4X_PACK 4, 2, 5, 6, 8, 3857, 1380, 4052, 601, 3 + psubsw m2, m0, m3 ; t9a t8a t11a t10a + paddsw m0, m3 ; t1a t0a t3a t2a + psubsw m3, m1, m4 ; t13a t12a t15a t14a + paddsw m1, m4 ; t5a t4a t7a t6a + ITX_MUL4X_PACK 2, 4, 5, 6, 8, 799, 4017, 3406, 2276, 3 + psubw m6, m7, m5 + ITX_MUL2X_PACK 3, 5, _, 8, 6, 4, 6 + vpbroadcastd m6, [o(pw_m3784_1567)] + vpbroadcastd m5, [o(pw_1567_3784)] + psubsw m4, m0, m1 ; t5 t4 t7 t6 + paddsw m0, m1 ; t1 t0 t3 t2 + psubsw m1, m2, m3 ; t13a t12a t15a t14a + paddsw m2, m3 ; t9a t8a t11a t10a + psubw m3, m7, m6 ; pw_3784_m1567 + vpblendd m6, m3, 0xf0 + ITX_MUL2X_PACK 4, 3, _, 8, 
6, 5, 4 ; t4a t5a t7a t6a + ITX_MUL2X_PACK 1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14 + vbroadcasti128 m5, [o(deint_shuf)] + pshufb m0, m5 + pshufb m2, m5 + vperm2i128 m3, m0, m2, 0x31 ; t3 t2 t11a t10a + vinserti128 m0, xm2, 1 ; t1 t0 t9a t8a + vperm2i128 m2, m4, m1, 0x31 ; t7a t6a t15 t14 + vinserti128 m4, xm1, 1 ; t4a t5a t12 t13 + pshufd m2, m2, q1032 ; t6a t7a t14 t15 + psubsw m1, m0, m3 ; t3a t2a t11 t10 + paddsw m0, m3 ; -out15 out0 out14 -out1 + paddsw m3, m4, m2 ; -out3 out12 out2 -out13 + psubsw m4, m2 ; t6 t7 t14a t15a + shufps m2, m1, m4, q1032 ; t2a t6 t10 t14a + vpblendd m4, m1, 0x33 ; t3a t7 t11 t15a + ret +ALIGN function_align +.main_pass1_end: + vpbroadcastd m5, [o(pw_m2896_2896)] + vpbroadcastd m6, [o(pw_2896_2896)] + punpcklwd m1, m4, m2 + punpckhwd m4, m2 + pmaddwd m2, m5, m4 + pmaddwd m4, m6 + pmaddwd m5, m1 + pmaddwd m1, m6 + REPX {paddd x, m8}, m5, m1, m2, m4 + REPX {psrad x, 12}, m5, m2, m1, m4 + packssdw m2, m5 ; -out11 out8 out10 -out9 + packssdw m1, m4 ; -out7 out4 out6 -out5 + ret + +INV_TXFM_4X16_FN flipadst, dct +INV_TXFM_4X16_FN flipadst, adst +INV_TXFM_4X16_FN flipadst, flipadst +INV_TXFM_4X16_FN flipadst, identity + +cglobal iflipadst_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 + mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + call m(iadst_16x4_internal_8bpc).main + vpbroadcastd m5, [o(pw_16384)] + punpcklwd m4, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m3, m2 + punpckhwd m3, m2 + REPX {pmulhrsw x, m5}, m4, m1, m0, m3 + punpckldq m2, m3, m1 + punpckhdq m3, m1 + punpckhdq m1, m0, m4 + punpckldq m0, m4 + jmp tx2q +.pass2: + call m(iadst_4x16_internal_8bpc).main + vpbroadcastd m5, [o(pw_2896x8)] + paddsw m1, m2, m4 + psubsw m2, m4 + pmulhrsw m1, m5 ; -out7 out4 out6 -out5 + pmulhrsw m2, m5 ; out8 -out11 -out9 out10 + vpbroadcastd m6, [o(pw_2048)] + pshufd m1, m1, q1032 + vpblendd m4, m0, m2, 0x33 + vpblendd m0, m1, 0xcc + vpblendd m1, m3, 0xcc + vpblendd m2, m3, 0x33 + vpermq m0, m0, q3120 + vpermq m1, m1, q0213 + vpermq m2, m2, q2031 + vpermq m3, m4, q1302 + psubw m5, m7, m6 + jmp m(iadst_4x16_internal_8bpc).end + +INV_TXFM_4X16_FN identity, dct +INV_TXFM_4X16_FN identity, adst +INV_TXFM_4X16_FN identity, flipadst +INV_TXFM_4X16_FN identity, identity + +cglobal iidentity_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 + mova m3, [cq+32*0] + mova m2, [cq+32*1] + mova m4, [cq+32*2] + mova m5, [cq+32*3] + vpbroadcastd m8, [o(pw_1697x8)] + pcmpeqw m0, m0 ; -1 + punpcklwd m1, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m5 + punpckhwd m4, m5 + pmulhrsw m5, m8, m1 + pmulhrsw m6, m8, m2 + pmulhrsw m7, m8, m3 + pmulhrsw m8, m4 + pcmpeqw m9, m0, m1 ; we want to do a signed avg, but pavgw is + pxor m1, m9 ; unsigned. as long as both signs are equal + pcmpeqw m9, m0, m2 ; it still works, but if the input is -1 the + pxor m2, m9 ; pmulhrsw result will become 0 which causes + pcmpeqw m9, m0, m3 ; pavgw to output -32768 instead of 0 unless + pxor m3, m9 ; we explicitly deal with that case here. 
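+ ; worked example for the in == -1 case: pmulhrsw(-1, 1697*8) =
+ ; (-13576 + 16384) >> 15 = 0, and pavgw(0xffff, 0x0000) =
+ ; (0xffff + 0 + 1) >> 1 = 0x8000 = -32768 instead of the expected 0.
+ ; flipping the -1 lanes to 0 first (pcmpeqw + pxor) gives
+ ; pavgw(0, 0) = 0, which matches the signed average.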
+ pcmpeqw m0, m4 + pxor m4, m0 + pavgw m1, m5 + pavgw m2, m6 + pavgw m3, m7 + pavgw m4, m8 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + vpbroadcastd m8, [o(pw_1697x16)] + vpbroadcastd m5, [o(pw_2048)] + pmulhrsw m4, m8, m0 + pmulhrsw m6, m8, m1 + pmulhrsw m7, m8, m2 + pmulhrsw m8, m3 + REPX {paddsw x, x}, m0, m1, m2, m3 + paddsw m0, m4 + paddsw m1, m6 + paddsw m2, m7 + paddsw m3, m8 + jmp m(iadst_4x16_internal_8bpc).end2 + +%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3 ; coefs[1-2], tmp[1-2], off[1-3] + movq xm%3, [dstq ] + movhps xm%3, [dstq+%5] + movq xm%4, [dstq+%6] + movhps xm%4, [dstq+%7] + pmovzxbw m%3, xm%3 + pmovzxbw m%4, xm%4 +%ifnum %1 + paddw m%3, m%1 +%else + paddw m%3, %1 +%endif +%ifnum %2 + paddw m%4, m%2 +%else + paddw m%4, %2 +%endif + packuswb m%3, m%4 + vextracti128 xm%4, m%3, 1 + movq [dstq ], xm%3 + movhps [dstq+%6], xm%3 + movq [dstq+%5], xm%4 + movhps [dstq+%7], xm%4 +%endmacro + +%macro INV_TXFM_8X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x4 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + mov [cq], eobd + pmulhrsw xm0, xm1 + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2 +%endif +%endmacro + +INV_TXFM_8X4_FN dct, dct +INV_TXFM_8X4_FN dct, adst +INV_TXFM_8X4_FN dct, flipadst +INV_TXFM_8X4_FN dct, identity + +cglobal idct_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpbroadcastd xm3, [o(pw_2896x8)] + pmulhrsw xm0, xm3, [cq+16*0] + pmulhrsw xm1, xm3, [cq+16*1] + pmulhrsw xm2, xm3, [cq+16*2] + pmulhrsw xm3, [cq+16*3] + call m(idct_4x8_internal_8bpc).main + vbroadcasti128 m4, [o(deint_shuf)] + vinserti128 m3, m1, xm3, 1 + vinserti128 m1, m0, xm2, 1 + shufps m0, m1, m3, q0220 + shufps m1, m3, q1331 + pshufb m0, m4 + pshufb m1, m4 + jmp tx2q +.pass2: + IDCT4_1D_PACKED + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + jmp m(iadst_8x4_internal_8bpc).end2 + +INV_TXFM_8X4_FN adst, dct +INV_TXFM_8X4_FN adst, adst +INV_TXFM_8X4_FN adst, flipadst +INV_TXFM_8X4_FN adst, identity + +cglobal iadst_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpbroadcastd xm0, [o(pw_2896x8)] + pshufd xm4, [cq+16*0], q1032 + pmulhrsw xm3, xm0, [cq+16*3] + pshufd xm5, [cq+16*1], q1032 + pmulhrsw xm2, xm0, [cq+16*2] + pmulhrsw xm4, xm0 + pmulhrsw xm5, xm0 + call m(iadst_4x8_internal_8bpc).main_pass1 + vinserti128 m0, xm2, 1 + vinserti128 m1, xm3, 1 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pxor m3, m3 + psubsw m3, m2 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + call .main +.end: + vpermq m0, m0, q3120 + vpermq m1, m1, q3120 +.end2: + vpbroadcastd m2, [o(pw_2048)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + WIN64_RESTORE_XMM +.end3: + pxor m2, m2 + mova [cq+32*0], m2 + mova [cq+32*1], m2 + lea r3, [strideq*3] + WRITE_8X4 0, 1, 4, 5 + RET +ALIGN function_align +cglobal_label .main + IADST4_1D_PACKED + ret + +INV_TXFM_8X4_FN flipadst, dct +INV_TXFM_8X4_FN flipadst, adst +INV_TXFM_8X4_FN flipadst, flipadst +INV_TXFM_8X4_FN flipadst, identity + +cglobal iflipadst_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpbroadcastd xm0, [o(pw_2896x8)] + pshufd xm4, [cq+16*0], q1032 + pmulhrsw xm3, xm0, [cq+16*3] + pshufd xm5, [cq+16*1], q1032 + pmulhrsw xm2, xm0, [cq+16*2] + pmulhrsw xm4, xm0 + pmulhrsw xm5, xm0 + call m(iadst_4x8_internal_8bpc).main_pass1 + vinserti128 m3, xm1, 1 + vinserti128 m2, xm0, 1 + punpckhwd m1, m3, m2 + punpcklwd m3, m2 + pxor m0, m0 + psubsw m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + call m(iadst_8x4_internal_8bpc).main + mova m2, 
m1 + vpermq m1, m0, q2031 + vpermq m0, m2, q2031 + jmp m(iadst_8x4_internal_8bpc).end2 + +INV_TXFM_8X4_FN identity, dct +INV_TXFM_8X4_FN identity, adst +INV_TXFM_8X4_FN identity, flipadst +INV_TXFM_8X4_FN identity, identity + +cglobal iidentity_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + mova xm2, [cq+16*0] + mova xm0, [cq+16*1] + vinserti128 m2, [cq+16*2], 1 + vinserti128 m0, [cq+16*3], 1 + vpbroadcastd m3, [o(pw_2896x8)] + punpcklwd m1, m2, m0 + punpckhwd m2, m0 + pmulhrsw m1, m3 + pmulhrsw m2, m3 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + paddsw m0, m0 + paddsw m1, m1 + jmp tx2q +.pass2: + vpbroadcastd m3, [o(pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + jmp m(iadst_8x4_internal_8bpc).end + +%macro INV_TXFM_8X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x8 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + mov [cq], eobd + or r3d, 8 +.dconly: + pmulhrsw xm0, xm2 +.dconly2: + movd xm2, [pw_2048] + pmulhrsw xm0, xm1 + lea r2, [strideq*3] + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 +.dconly_loop: + WRITE_8X4 0, 0, 1, 2, strideq*1, strideq*2, r2 + lea dstq, [dstq+strideq*4] + sub r3d, 4 + jg .dconly_loop + RET +%endif +%endmacro + +INV_TXFM_8X8_FN dct, dct +INV_TXFM_8X8_FN dct, adst +INV_TXFM_8X8_FN dct, flipadst +INV_TXFM_8X8_FN dct, identity + +cglobal idct_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 ; 0 1 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpermq m2, [cq+32*2], q3120 ; 4 5 + vpermq m1, [cq+32*1], q3120 ; 2 3 + call .main + shufps m4, m0, m1, q0220 + shufps m5, m0, m1, q1331 + shufps m1, m2, m3, q0220 + shufps m3, m2, m3, q1331 + vbroadcasti128 m0, [o(deint_shuf)] + vpbroadcastd m2, [o(pw_16384)] + REPX {pshufb x, m0}, m4, m5, m1, m3 + REPX {pmulhrsw x, m2}, m4, m5, m1, m3 + vinserti128 m0, m4, xm1, 1 + vperm2i128 m2, m4, m1, 0x31 + vinserti128 m1, m5, xm3, 1 + vperm2i128 m3, m5, m3, 0x31 + jmp tx2q +.pass2: + call .main + vpbroadcastd m4, [o(pw_2048)] + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + jmp m(iadst_8x8_internal_8bpc).end2 +ALIGN function_align +cglobal_label .main + IDCT8_1D_PACKED + ret + +INV_TXFM_8X8_FN adst, dct +INV_TXFM_8X8_FN adst, adst +INV_TXFM_8X8_FN adst, flipadst +INV_TXFM_8X8_FN adst, identity + +cglobal iadst_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m4, [cq+32*0], q1302 ; 1 0 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpermq m5, [cq+32*1], q1302 ; 3 2 + vpermq m2, [cq+32*2], q3120 ; 4 5 + call .main_pass1 + vpbroadcastd m5, [o(pw_16384)] + punpcklwd m4, m0, m1 + punpckhwd m0, m1 + punpcklwd m1, m2, m3 + punpckhwd m2, m3 + pxor m3, m3 + psubw m3, m5 ; negate odd elements during rounding + pmulhrsw m4, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m5 + pmulhrsw m2, m3 + punpcklwd m3, m4, m0 + punpckhwd m4, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + vperm2i128 m2, m3, m0, 0x31 + vinserti128 m0, m3, xm0, 1 + vperm2i128 m3, m4, m1, 0x31 + vinserti128 m1, m4, xm1, 1 + jmp tx2q +.pass2: + pshufd m4, m0, q1032 + pshufd m5, m1, q1032 + call .main_pass2 + vpbroadcastd m5, [o(pw_2048)] + vpbroadcastd xm4, [o(pw_4096)] + psubw m4, m5 ; lower half = 2048, upper half = -2048 +.end: + REPX {vpermq x, x, q3120}, m0, m1, m2, m3 +.end2: + pmulhrsw m0, m4 + pmulhrsw m1, m4 +.end3: + pmulhrsw m2, m4 + pmulhrsw m3, m4 + WIN64_RESTORE_XMM +.end4: + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + mova [cq+32*2], m4 + mova [cq+32*3], m4 + lea r3, [strideq*3] + WRITE_8X4 0, 1, 4, 
5 + lea dstq, [dstq+strideq*4] + WRITE_8X4 2, 3, 4, 5 + RET +ALIGN function_align +.main_pass1: + IADST8_1D_PACKED 1 + ret +ALIGN function_align +cglobal_label .main_pass2 + IADST8_1D_PACKED 2 + ret + +INV_TXFM_8X8_FN flipadst, dct +INV_TXFM_8X8_FN flipadst, adst +INV_TXFM_8X8_FN flipadst, flipadst +INV_TXFM_8X8_FN flipadst, identity + +cglobal iflipadst_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m4, [cq+32*0], q1302 ; 1 0 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpermq m5, [cq+32*1], q1302 ; 3 2 + vpermq m2, [cq+32*2], q3120 ; 4 5 + call m(iadst_8x8_internal_8bpc).main_pass1 + vpbroadcastd m5, [o(pw_16384)] + punpckhwd m4, m3, m2 + punpcklwd m3, m2 + punpckhwd m2, m1, m0 + punpcklwd m1, m0 + pxor m0, m0 + psubw m0, m5 + pmulhrsw m4, m0 + pmulhrsw m3, m5 + pmulhrsw m2, m0 + pmulhrsw m1, m5 + punpckhwd m0, m4, m3 + punpcklwd m4, m3 + punpckhwd m3, m2, m1 + punpcklwd m2, m1 + vinserti128 m1, m0, xm3, 1 + vperm2i128 m3, m0, m3, 0x31 + vinserti128 m0, m4, xm2, 1 + vperm2i128 m2, m4, m2, 0x31 + jmp tx2q +.pass2: + pshufd m4, m0, q1032 + pshufd m5, m1, q1032 + call m(iadst_8x8_internal_8bpc).main_pass2 + vpbroadcastd m4, [o(pw_2048)] + vpbroadcastd xm5, [o(pw_4096)] + psubw m4, m5 ; lower half = -2048, upper half = 2048 + vpermq m5, m3, q2031 + vpermq m3, m0, q2031 + vpermq m0, m2, q2031 + vpermq m2, m1, q2031 + pmulhrsw m1, m0, m4 + pmulhrsw m0, m5, m4 + jmp m(iadst_8x8_internal_8bpc).end3 + +INV_TXFM_8X8_FN identity, dct +INV_TXFM_8X8_FN identity, adst +INV_TXFM_8X8_FN identity, flipadst +INV_TXFM_8X8_FN identity, identity + +cglobal iidentity_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + mova xm3, [cq+16*0] + mova xm2, [cq+16*1] + vinserti128 m3, [cq+16*4], 1 + vinserti128 m2, [cq+16*5], 1 + mova xm4, [cq+16*2] + mova xm0, [cq+16*3] + vinserti128 m4, [cq+16*6], 1 + vinserti128 m0, [cq+16*7], 1 + punpcklwd m1, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m0 + punpckhwd m4, m0 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + vpbroadcastd m4, [o(pw_4096)] + jmp m(iadst_8x8_internal_8bpc).end + +%macro INV_TXFM_8X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x16 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm1 + or r3d, 16 + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly +%endif +%endmacro + +%macro ITX_8X16_LOAD_COEFS 0 + vpbroadcastd m4, [o(pw_2896x8)] + pmulhrsw m0, m4, [cq+32*0] + add cq, 32*4 + pmulhrsw m7, m4, [cq+32*3] + pmulhrsw m1, m4, [cq-32*3] + pmulhrsw m6, m4, [cq+32*2] + pmulhrsw m2, m4, [cq-32*2] + pmulhrsw m5, m4, [cq+32*1] + pmulhrsw m3, m4, [cq-32*1] + pmulhrsw m4, [cq+32*0] +%endmacro + +INV_TXFM_8X16_FN dct, dct +INV_TXFM_8X16_FN dct, adst +INV_TXFM_8X16_FN dct, flipadst +INV_TXFM_8X16_FN dct, identity + +cglobal idct_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 + ITX_8X16_LOAD_COEFS + call m(idct_16x8_internal_8bpc).main + vpbroadcastd m10, [o(pw_16384)] +.pass1_end: + vperm2i128 m9, m3, m7, 0x31 + vinserti128 m3, xm7, 1 + vperm2i128 m8, m2, m6, 0x31 + vinserti128 m2, xm6, 1 + vperm2i128 m6, m1, m5, 0x31 + vinserti128 m1, xm5, 1 + vperm2i128 m5, m0, m4, 0x31 + vinserti128 m0, xm4, 1 + punpckhwd m4, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m1 + punpcklwd m0, m1 +.pass1_end2: + punpckhwd m7, m5, m6 + punpcklwd m5, m6 + punpcklwd m6, m8, m9 + punpckhwd m8, m9 + REPX {pmulhrsw x, m10}, m2, m0, m4, m3, m5, m6, m7, m8 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 
+
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, adst
+INV_TXFM_8X16_FN dct, flipadst
+INV_TXFM_8X16_FN dct, identity
+
+cglobal idct_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+    ITX_8X16_LOAD_COEFS
+    call m(idct_16x8_internal_8bpc).main
+    vpbroadcastd m10, [o(pw_16384)]
+.pass1_end:
+    vperm2i128 m9, m3, m7, 0x31
+    vinserti128 m3, xm7, 1
+    vperm2i128 m8, m2, m6, 0x31
+    vinserti128 m2, xm6, 1
+    vperm2i128 m6, m1, m5, 0x31
+    vinserti128 m1, xm5, 1
+    vperm2i128 m5, m0, m4, 0x31
+    vinserti128 m0, xm4, 1
+    punpckhwd m4, m2, m3
+    punpcklwd m2, m3
+    punpckhwd m3, m0, m1
+    punpcklwd m0, m1
+.pass1_end2:
+    punpckhwd m7, m5, m6
+    punpcklwd m5, m6
+    punpcklwd m6, m8, m9
+    punpckhwd m8, m9
+    REPX {pmulhrsw x, m10}, m2, m0, m4, m3, m5, m6, m7, m8
+    punpckhdq m1, m0, m2
+    punpckldq m0, m2
+    punpckldq m2, m3, m4
+    punpckhdq m3, m4
+    punpckldq m4, m5, m6
+    punpckhdq m5, m6
+    punpckldq m6, m7, m8
+    punpckhdq m7, m8
+    jmp tx2q
+.pass2:
+    call .main
+    REPX {vpermq x, x, q3120}, m0, m2, m4, m6
+    REPX {vpermq x, x, q2031}, m1, m3, m5, m7
+.end:
+    vpbroadcastd m8, [o(pw_2048)]
+.end2:
+    REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+.end3:
+    pxor m8, m8
+    REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3
+    lea r3, [strideq*3]
+    WRITE_8X4 0, 1, 8, 9
+    lea dstq, [dstq+strideq*4]
+    WRITE_8X4 2, 3, 0, 1
+    lea dstq, [dstq+strideq*4]
+    WRITE_8X4 4, 5, 0, 1
+    lea dstq, [dstq+strideq*4]
+    WRITE_8X4 6, 7, 0, 1
+    RET
+ALIGN function_align
+cglobal_label .main
+    IDCT16_1D_PACKED
+    ret
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, adst
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, identity
+
+cglobal iadst_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+    ITX_8X16_LOAD_COEFS
+    call m(iadst_16x8_internal_8bpc).main
+    call m(iadst_16x8_internal_8bpc).main_pass1_end
+    vpbroadcastd m10, [o(pw_16384)]
+    pslld m9, m10, 17
+    psubw m10, m9 ; 16384, -16384
+    jmp m(idct_8x16_internal_8bpc).pass1_end
+ALIGN function_align
+.pass2:
+    call .main
+    call .main_pass2_end
+    vpbroadcastd m9, [o(pw_2048)]
+    vpbroadcastd xm8, [o(pw_4096)]
+    psubw m8, m9
+    REPX {vpermq x, x, q2031}, m0, m1, m2, m3
+    REPX {vpermq x, x, q3120}, m4, m5, m6, m7
+    jmp m(idct_8x16_internal_8bpc).end2
+ALIGN function_align
+cglobal_label .main
+    REPX {pshufd x, x, q1032}, m7, m1, m5, m3
+.main2:
+    vpbroadcastd m10, [o(pd_2048)]
+    punpckhwd m8, m7, m0 ; in14 in1
+    punpcklwd m0, m7     ; in0  in15
+    punpcklwd m7, m6, m1 ; in12 in3
+    punpckhwd m1, m6     ; in2  in13
+    punpckhwd m6, m5, m2 ; in10 in5
+    punpcklwd m2, m5     ; in4  in11
+    punpcklwd m5, m4, m3 ; in8  in7
+    punpckhwd m3, m4     ; in6  in9
+    ITX_MUL2X_PACK 0, 4, 9, 10,  201, 4091, 3 ; t0  t1
+    ITX_MUL2X_PACK 1, 4, 9, 10,  995, 3973, 3 ; t2  t3
+    ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 3 ; t4  t5
+    ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 3 ; t6  t7
+    ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 3 ; t8  t9
+    ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 3 ; t10 t11
+    ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 3 ; t12 t13
+    ITX_MUL2X_PACK 8, 4, 9, 10, 4052,  601, 3 ; t14 t15
+    psubsw m4, m0, m5 ; t9a  t8a
+    paddsw m0, m5     ; t1a  t0a
+    psubsw m5, m1, m6 ; t11a t10a
+    paddsw m1, m6     ; t3a  t2a
+    psubsw m6, m2, m7 ; t13a t12a
+    paddsw m2, m7     ; t5a  t4a
+    psubsw m7, m3, m8 ; t15a t14a
+    paddsw m3, m8     ; t7a  t6a
+    vpbroadcastd m11, [o(pw_m4017_799)]
+    vpbroadcastd m12, [o(pw_799_4017)]
+    pxor m9, m9
+    ITX_MUL2X_PACK 4, 8, _, 10, 11, 12, 6 ; t8  t9
+    psubw m8, m9, m11 ; pw_4017_m799
+    ITX_MUL2X_PACK 6, 12, _, 10, 12, 8, 6 ; t12 t13
+    vpbroadcastd m11, [o(pw_m2276_3406)]
+    vpbroadcastd m12, [o(pw_3406_2276)]
+    ITX_MUL2X_PACK 5, 8, _, 10, 11, 12, 6 ; t10 t11
+    psubw m8, m9, m11 ; pw_2276_m3406
+    ITX_MUL2X_PACK 7, 12, _, 10, 12, 8, 6 ; t14 t15
+    psubsw m8, m1, m3 ; t7   t6
+    paddsw m1, m3     ; t3   t2
+    psubsw m3, m0, m2 ; t5   t4
+    paddsw m0, m2     ; t1   t0
+    psubsw m2, m5, m7 ; t14a t15a
+    paddsw m7, m5     ; t10a t11a
+    psubsw m5, m4, m6 ; t12a t13a
+    paddsw m4, m6     ; t8a  t9a
+    vpbroadcastd m11, [o(pw_m3784_1567)]
+    vpbroadcastd m12, [o(pw_1567_3784)]
+    ITX_MUL2X_PACK 3, 6, _, 10, 12, 11, 6 ; t5a t4a
+    psubw m6, m9, m11 ; pw_3784_m1567
+    ITX_MUL2X_PACK 8, 6, _, 10, 6, 12, 6  ; t7a t6a
+    vpbroadcastd m11, [o(pw_m1567_3784)]
+    vpbroadcastd m12, [o(pw_3784_1567)]
+    ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 6 ; t15 t14
+    psubw m6, m9, m11 ; pw_1567_m3784
+    ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 6 ; t13 t12
+    vbroadcasti128 m12, [o(deint_shuf)]
+    paddsw m6, m4, m7  ; -out1  out14
+    psubsw m4, m7      ;  t10   t11
+    psubsw m11, m3, m8 ;  t7    t6
+    paddsw m8, m3      ;  out12 -out3
+    psubsw m3, m0, m1  ;  t3a   t2a
+    paddsw m0, m1      ; -out15 out0
+    paddsw m1, m2, m5  ; -out13 out2
+    psubsw m5, m2      ;  t15a  t14a
+    pshufb m0, m12
+    pshufb m6, m12
+    pshufb m8, m12
+    pshufb m1, m12
+    shufps m7, m6, m0, q1032 ;  out14 -out15
+    vpblendd m0, m6, 0x33    ; -out1   out0
+    punpcklqdq m6, m8, m1    ;  out12 -out13
+    punpckhqdq m1, m8, m1    ; -out3   out2
+    ret
+ALIGN function_align
+.main_pass1_end:
+    vpbroadcastd m8, [o(pw_m2896_2896)]
+    vpbroadcastd m12, [o(pw_2896_2896)]
+    pmaddwd m9, m8, m11  ; -out11
+    pmaddwd m2, m12, m5  ; -out5
+    pmaddwd m5, m8       ;  out10
+    pmaddwd m11, m12     ;  out4
+    REPX {paddd x, m10}, m9, m5, m2, m11
+    REPX {psrad x, 12 }, m9, m5, m2, m11
+    packssdw m5, m9      ;  out10 -out11
+    packssdw m2, m11     ; -out5   out4
+    pmaddwd m11, m8, m3  ;  out8
+    vpbroadcastd m8, [o(pw_2896_m2896)]
+    pmaddwd m3, m12      ; -out7
+    pmaddwd m8, m4       ; -out9
+    pmaddwd m4, m12      ;  out6
+    REPX {paddd x, m10}, m11, m3, m8, m4
+    REPX {psrad x, 12 }, m11, m3, m8, m4
+    packssdw m3, m4      ; -out7   out6
+    packssdw m4, m11, m8 ;  out8  -out9
+    vpbroadcastd m10, [o(pw_16384)]
+    pxor m9, m9
+    ret
+ALIGN function_align
+cglobal_label .main_pass2_end
+    vpbroadcastd m8, [o(pw_2896x8)]
+    pshufb m2, m11, m12
+    pshufb m5, m12
+    pshufb m3, m12
+    pshufb m4, m12
+    punpcklqdq m11, m5, m2   ;  t15a t7
+    punpckhqdq m5, m2        ;  t14a t6
+    shufps m2, m3, m4, q1032 ;  t2a  t10
+    vpblendd m3, m4, 0xcc    ;  t3a  t11
+    psubsw m4, m2, m3        ;  out8 -out9
+    paddsw m3, m2            ; -out7  out6
+    paddsw m2, m5, m11       ; -out5  out4
+    psubsw m5, m11           ;  out10 -out11
+    REPX {pmulhrsw x, m8}, m2, m3, m4, m5
+    ret
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+INV_TXFM_8X16_FN flipadst, identity
+
+cglobal iflipadst_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+    ITX_8X16_LOAD_COEFS
+    call m(iadst_16x8_internal_8bpc).main
+    call m(iadst_16x8_internal_8bpc).main_pass1_end
+    vpbroadcastd m9, [o(pw_16384)]
+    pslld m10, m9, 17
+    psubw m10, m9 ; -16384, 16384
+    vperm2i128 m9, m4, m0, 0x31
+    vinserti128 m0, m4, xm0, 1
+    vperm2i128 m8, m5, m1, 0x31
+    vinserti128 m4, m5, xm1, 1
+    vperm2i128 m5, m7, m3, 0x31
+    vinserti128 m3, m7, xm3, 1
+    vinserti128 m1, m6, xm2, 1
+    vperm2i128 m6, m6, m2, 0x31
+    punpcklwd m2, m4, m0
+    punpckhwd m4, m0
+    punpcklwd m0, m3, m1
+    punpckhwd m3, m1
+    jmp m(idct_8x16_internal_8bpc).pass1_end2
+.pass2:
+    call m(iadst_8x16_internal_8bpc).main
+    call m(iadst_8x16_internal_8bpc).main_pass2_end
+    vpbroadcastd m8, [o(pw_2048)]
+    vpbroadcastd xm9, [o(pw_4096)]
+    psubw m8, m9
+    vpermq m9, m0, q3120
+    vpermq m0, m7, q2031
+    vpermq m7, m1, q3120
+    vpermq m1, m6, q2031
+    vpermq m6, m2, q3120
+    vpermq m2, m5, q2031
+    vpermq m5, m3, q3120
+    vpermq m3, m4, q2031
+    pmulhrsw m0, m8
+    pmulhrsw m1, m8
+    pmulhrsw m2, m8
+    pmulhrsw m3, m8
+    pmulhrsw m4, m5, m8
+    pmulhrsw m5, m6, m8
+    pmulhrsw m6, m7, m8
+    pmulhrsw m7, m9, m8
+    jmp m(idct_8x16_internal_8bpc).end3
+
+INV_TXFM_8X16_FN identity, dct
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
+%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
+    pmulhrsw m%2, m%3, m%1
+%if %0 == 4 ; if downshifting by 1
+    pmulhrsw m%2, m%4
+%else
+    paddsw m%1, m%1
+%endif
+    paddsw m%1, m%2
+%endmacro
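+
+; 16-point identity scaling: 1697/2048 ~= 2*sqrt(2)-2, so the 3-argument
+; form computes 2*x + x*1697/2048 ~= 2*sqrt(2)*x. The 4-argument form
+; halves the correction term instead, giving x + x*1697/4096 ~= sqrt(2)*x.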
+
+cglobal iidentity_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+    mova xm3, [cq+16*0]
+    mova xm2, [cq+16*2]
+    add cq, 16*8
+    vinserti128 m3, [cq+16*0], 1
+    vinserti128 m2, [cq+16*2], 1
+    vpbroadcastd m9, [o(pw_2896x8)]
+    mova xm4, [cq-16*4]
+    mova xm5, [cq-16*2]
+    vinserti128 m4, [cq+16*4], 1
+    vinserti128 m5, [cq+16*6], 1
+    mova xm7, [cq-16*7]
+    mova xm6, [cq-16*5]
+    vinserti128 m7, [cq+16*1], 1
+    vinserti128 m6, [cq+16*3], 1
+    mova xm8, [cq-16*3]
+    mova xm0, [cq-16*1]
+    vinserti128 m8, [cq+16*5], 1
+    vinserti128 m0, [cq+16*7], 1
+    punpcklwd m1, m3, m2
+    punpckhwd m3, m2
+    punpcklwd m2, m4, m5
+    punpckhwd m4, m5
+    punpcklwd m5, m7, m6
+    punpckhwd m7, m6
+    punpcklwd m6, m8, m0
+    punpckhwd m8, m0
+    REPX {pmulhrsw x, m9}, m1, m2, m3, m4, m5, m6, m7, m8
+    punpckldq m0, m1, m2
+    punpckhdq m1, m2
+    punpckldq m2, m3, m4
+    punpckhdq m3, m4
+    punpckldq m4, m5, m6
+    punpckhdq m5, m6
+    punpckldq m6, m7, m8
+    punpckhdq m7, m8
+    jmp tx2q
+.pass2:
+    vpbroadcastd m8, [o(pw_1697x16)]
+    REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7
+    jmp m(idct_8x16_internal_8bpc).end
+
+%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2]
+    pmovzxbw m%3, [dstq+%5]
+%ifnum %1
+    paddw m%3, m%1
+%else
+    paddw m%3, %1
+%endif
+    pmovzxbw m%4, [dstq+%6]
+%ifnum %2
+    paddw m%4, m%2
+%else
+    paddw m%4, %2
+%endif
+    packuswb m%3, m%4
+    vpermq m%3, m%3, q3120
+    mova [dstq+%5], xm%3
+    vextracti128 [dstq+%6], m%3, 1
+%endmacro
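+
+; WRITE_16X2 adds two rows of 16-word residuals to the destination:
+; pmovzxbw widens the pixels to 16 bits, paddw adds the residual, and
+; packuswb clamps back to 8 bits; vpermq undoes the per-lane interleaving
+; introduced by packing two ymm rows into one register.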
+
+%macro INV_TXFM_16X4_FN 2 ; type1, type2
+    INV_TXFM_FN %1, %2, 16x4
+%ifidn %1_%2, dct_dct
+    movd xm1, [o(pw_2896x8)]
+    pmulhrsw xm0, xm1, [cq]
+    movd xm2, [o(pw_16384)]
+    mov [cq], eobd
+    or r3d, 4
+.dconly:
+    pmulhrsw xm0, xm2
+    movd xm2, [pw_2048] ; intentionally rip-relative
+    pmulhrsw xm0, xm1
+    pmulhrsw xm0, xm2
+    vpbroadcastw m0, xm0
+    pxor m3, m3
+.dconly_loop:
+    mova xm1, [dstq+strideq*0]
+    vinserti128 m1, [dstq+strideq*1], 1
+    punpckhbw m2, m1, m3
+    punpcklbw m1, m3
+    paddw m2, m0
+    paddw m1, m0
+    packuswb m1, m2
+    mova [dstq+strideq*0], xm1
+    vextracti128 [dstq+strideq*1], m1, 1
+    lea dstq, [dstq+strideq*2]
+    sub r3d, 2
+    jg .dconly_loop
+    RET
+%endif
+%endmacro
+
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+INV_TXFM_16X4_FN dct, identity
+
+cglobal idct_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+    mova xm0, [cq+16*0]
+    mova xm1, [cq+16*1]
+    mova xm2, [cq+16*2]
+    mova xm3, [cq+16*3]
+    mova xm4, [cq+16*4]
+    mova xm5, [cq+16*5]
+    mova xm6, [cq+16*6]
+    mova xm7, [cq+16*7]
+    call m(idct_4x16_internal_8bpc).main
+    vinserti128 m6, m2, xm6, 1
+    vinserti128 m2, m0, xm4, 1
+    vinserti128 m0, m1, xm5, 1
+    vinserti128 m1, m3, xm7, 1
+    punpcklwd m3, m2, m6
+    punpckhwd m2, m6
+    vpbroadcastd m6, [o(pw_16384)]
+    punpckhwd m4, m0, m1
+    punpcklwd m0, m1
+    mova m1, m6
+    jmp m(iadst_16x4_internal_8bpc).pass1_end
+.pass2:
+    call .main
+    jmp m(iadst_16x4_internal_8bpc).end
+ALIGN function_align
+cglobal_label .main
+    vpbroadcastd m6, [o(pd_2048)]
+    IDCT4_1D 0, 1, 2, 3, 4, 5, 6
+    ret
+
+INV_TXFM_16X4_FN adst, dct
+INV_TXFM_16X4_FN adst, adst
+INV_TXFM_16X4_FN adst, flipadst
+INV_TXFM_16X4_FN adst, identity
+
+cglobal iadst_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+    vpermq m0, [cq+32*0], q1230
+    vpermq m3, [cq+32*3], q2103
+    vpermq m1, [cq+32*1], q1230
+    vpermq m2, [cq+32*2], q2103
+    call m(iadst_4x16_internal_8bpc).main2
+    call m(iadst_4x16_internal_8bpc).main_pass1_end
+    punpcklwd m4, m3, m1
+    punpcklwd m5, m2, m0
+    punpckhwd m0, m1
+    punpckhwd m2, m3
+    vpbroadcastd m1, [o(pw_16384)]
+    vinserti128 m3, m0, xm2, 1
+    vperm2i128 m2, m0, m2, 0x31
+    vinserti128 m0, m4, xm5, 1
+    vperm2i128 m4, m4, m5, 0x31
+    psubw m6, m7, m1
+.pass1_end:
+    pmulhrsw m3, m1
+    pmulhrsw m2, m6
+    pmulhrsw m4, m1
+    pmulhrsw m0, m6
+    punpcklwd m1, m3, m2
+    punpckhwd m3, m2
+    punpcklwd m2, m4, m0
+    punpckhwd m4, m0
+    punpckldq m0, m1, m2
+    punpckhdq m1, m2
+    punpckldq m2, m3, m4
+    punpckhdq m3, m4
+    jmp tx2q
+.pass2:
+    call .main
+.end:
+    vpbroadcastd m4, [o(pw_2048)]
+    REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+    WIN64_RESTORE_XMM
+.end2:
+    pxor m4, m4
+    mova [cq+32*0], m4
+    mova [cq+32*1], m4
+    mova [cq+32*2], m4
+    mova [cq+32*3], m4
+.end3:
+    WRITE_16X2 0, 1, 4, 5, strideq*0, strideq*1
+    lea dstq, [dstq+strideq*2]
+    WRITE_16X2 2, 3, 4, 5, strideq*0, strideq*1
+    RET
+ALIGN function_align
+cglobal_label .main
+    vpbroadcastd m6, [o(pw_m3344_3344)]
+    vpbroadcastd m7, [o(pw_3803_1321)]
+    vpbroadcastd m8, [o(pw_m1321_2482)]
+    vpbroadcastd m9, [o(pw_2482_3344)]
+    punpcklwd m4, m2, m0 ; in2 in0 l
+    punpckhwd m2, m0     ; in2 in0 h
+    psrld m5, m6, 16
+    pmaddwd m10, m6, m4  ; t2:02 l
+    pmaddwd m6, m2       ; t2:02 h
+    pmaddwd m0, m7, m4   ; t0:02 l
+    pmaddwd m7, m2       ; t0:02 h
+    pmaddwd m4, m8       ; t1:02 l
+    pmaddwd m8, m2       ; t1:02 h
+    punpckhwd m2, m3, m1 ; in3 in1 h
+    punpcklwd m3, m1     ; in3 in1 l
+    pmaddwd m1, m5, m2   ; t2:3 h
+    pmaddwd m5, m3       ; t2:3 l
+    paddd m6, m1
+    vpbroadcastd m1, [o(pd_2048)]
+    paddd m10, m5
+    pmaddwd m5, m9, m3
+    pmaddwd m9, m2
+    paddd m0, m1
+    paddd m7, m1
+    paddd m0, m5 ; t0 + t3 + 2048 l
+    paddd m7, m9 ; t0 + t3 + 2048 h
+    vpbroadcastd m9, [o(pw_m3803_3344)]
+    pmaddwd m5, m9, m2
+    pmaddwd m9, m3
+    paddd m10, m1 ; t2 + 2048 l
+    paddd m6, m1  ; t2 + 2048 h
+    paddd m5, m1  ; t1:13 + 2048 h
+    paddd m1, m9  ; t1:13 + 2048 l
+    vpbroadcastd m9, [o(pw_m3803_m6688)]
+    pmaddwd m2, m9
+    pmaddwd m3, m9
+    paddd m5, m8 ; t1 + t3 + 2048 h
+    paddd m1, m4 ; t1 + t3 + 2048 l
+    paddd m8, m7
+    paddd m4, m0
+    paddd m2, m8 ; t0 + t1 - t3 + 2048 h
+    paddd m3, m4 ; t0 + t1 - t3 + 2048 l
+    REPX {psrad x, 12}, m10, m6, m0, m7, m5, m1, m2, m3
+    packssdw m0, m7
+    packssdw m1, m5
+    packssdw m3, m2
+    packssdw m2, m10, m6
+    ret
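+
+; The 4-point adst above keeps its t0..t3 accumulators in 32 bits
+; (pmaddwd/paddd) and only narrows back to words after the final
+; 12-bit downshift.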
+
+INV_TXFM_16X4_FN flipadst, dct
+INV_TXFM_16X4_FN flipadst, adst
+INV_TXFM_16X4_FN flipadst, flipadst
+INV_TXFM_16X4_FN flipadst, identity
+
+cglobal iflipadst_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+    vpermq m0, [cq+32*0], q1230
+    vpermq m3, [cq+32*3], q2103
+    vpermq m1, [cq+32*1], q1230
+    vpermq m2, [cq+32*2], q2103
+    call m(iadst_4x16_internal_8bpc).main2
+    call m(iadst_4x16_internal_8bpc).main_pass1_end
+    punpckhwd m4, m3, m2
+    punpckhwd m5, m1, m0
+    punpcklwd m0, m2
+    punpcklwd m1, m3
+    vpbroadcastd m6, [o(pw_16384)]
+    vinserti128 m3, m0, xm1, 1
+    vperm2i128 m2, m0, m1, 0x31
+    vinserti128 m0, m4, xm5, 1
+    vperm2i128 m4, m4, m5, 0x31
+    psubw m1, m7, m6
+    jmp m(iadst_16x4_internal_8bpc).pass1_end
+ALIGN function_align
+.pass2:
+    call m(iadst_16x4_internal_8bpc).main
+    vpbroadcastd m4, [o(pw_2048)]
+    REPX {pmulhrsw x, m4}, m3, m2, m1, m0
+    pxor m4, m4
+    mova [cq+32*0], m4
+    mova [cq+32*1], m4
+    mova [cq+32*2], m4
+    mova [cq+32*3], m4
+    WRITE_16X2 3, 2, 4, 5, strideq*0, strideq*1
+    lea dstq, [dstq+strideq*2]
+    WRITE_16X2 1, 0, 4, 5, strideq*0, strideq*1
+    RET
+
+INV_TXFM_16X4_FN identity, dct
+INV_TXFM_16X4_FN identity, adst
+INV_TXFM_16X4_FN identity, flipadst
+INV_TXFM_16X4_FN identity, identity
+
+cglobal iidentity_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
+    mova xm2, [cq+16*0]
+    mova xm4, [cq+16*1]
+    vinserti128 m2, [cq+16*4], 1
+    vinserti128 m4, [cq+16*5], 1
+    mova xm0, [cq+16*2]
+    mova xm1, [cq+16*3]
+    vinserti128 m0, [cq+16*6], 1
+    vinserti128 m1, [cq+16*7], 1
+    vpbroadcastd m7, [o(pw_1697x16)]
+    vpbroadcastd m8, [o(pw_16384)]
+    punpcklwd m3, m2, m4
+    punpckhwd m2, m4
+    punpcklwd m4, m0, m1
+    punpckhwd m0, m1
+    punpcklwd m1, m3, m2
+    punpckhwd m3, m2
+    punpcklwd m2, m4, m0
+    punpckhwd m4, m0
+    pmulhrsw m0, m7, m1
+    pmulhrsw m5, m7, m2
+    pmulhrsw m6, m7, m3
+    pmulhrsw m7, m4
+    REPX {pmulhrsw x, m8}, m0, m5, m6, m7
+    paddsw m1, m0
+    paddsw m2, m5
+    paddsw m3, m6
+    paddsw m4, m7
+    punpcklqdq m0, m1, m2
+    punpckhqdq m1, m2
+    punpcklqdq m2, m3, m4
+    punpckhqdq m3, m4
+    jmp tx2q
+.pass2:
+    vpbroadcastd m7, [o(pw_1697x8)]
+    pmulhrsw m4, m7, m0
+    pmulhrsw m5, m7, m1
+    pmulhrsw m6, m7, m2
+    pmulhrsw m7, m3
+    paddsw m0, m4
+    paddsw m1, m5
+    paddsw m2, m6
+    paddsw m3, m7
+    jmp m(iadst_16x4_internal_8bpc).end
+
+%macro INV_TXFM_16X8_FN 2 ; type1, type2
+    INV_TXFM_FN %1, %2, 16x8
+%ifidn %1_%2, dct_dct
+    movd xm1, [o(pw_2896x8)]
+    pmulhrsw xm0, xm1, [cq]
+    movd xm2, [o(pw_16384)]
+    mov [cq], eobd
+    pmulhrsw xm0, xm1
+    or r3d, 8
+    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+%endif
+%endmacro
+
+%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd
+    vpbroadcastd m8, [o(pw_2896x8)]
+    vpermq m0, [cq+32*0], q3120
+    add cq, 32*4
+    vpermq m7, [cq+32*3], q%1
+    vpermq m1, [cq-32*3], q%1
+    vpermq m6, [cq+32*2], q3120
+    vpermq m2, [cq-32*2], q3120
+    vpermq m5, [cq+32*1], q%1
+    vpermq m3, [cq-32*1], q%1
+    vpermq m4, [cq+32*0], q3120
+    REPX {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, adst
+INV_TXFM_16X8_FN dct, flipadst
+INV_TXFM_16X8_FN dct, identity
+
+cglobal idct_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+    ITX_16X8_LOAD_COEFS 3120
+    call m(idct_8x16_internal_8bpc).main
+    vpbroadcastd m10, [o(pw_16384)]
+    punpckhwd m8, m0, m2
+    punpcklwd m0, m2
+    punpckhwd m2, m1, m3
+    punpcklwd m1, m3
+    punpcklwd m9, m4, m6
+    punpckhwd m4, m6
+    punpcklwd m6, m5, m7
+    punpckhwd m5, m7
+    REPX {pmulhrsw x, m10}, m8, m1, m4, m6
+.pass1_end:
+    REPX {pmulhrsw x, m10}, m0, m2, m9, m5
+    punpckhwd m3, m0, m8
+    punpcklwd m0, m8
+    punpckhwd m8, m2, m1
+    punpcklwd m2, m1
+    punpcklwd m7, m9, m4
+    punpckhwd m9, m4
+    punpcklwd m4, m5, m6
+    punpckhwd m5, m6
+    punpckhdq m1, m0, m2
+    punpckldq m0, m2
+    punpckldq m2, m3, m8
+    punpckhdq m3, m8
+    punpckldq m6, m7, m4
+    punpckhdq m7, m4
+    punpckldq m8, m9, m5
+    punpckhdq m9, m5
+    vperm2i128 m4, m0, m6, 0x31
+    vinserti128 m0, xm6, 1
+    vperm2i128 m5, m1, m7, 0x31
+    vinserti128 m1, xm7, 1
+    vperm2i128 m6, m2, m8, 0x31
+    vinserti128 m2, xm8, 1
+    vperm2i128 m7, m3, m9, 0x31
+    vinserti128 m3, xm9, 1
+    jmp tx2q
+.pass2:
+    call .main
+    vpbroadcastd m8, [o(pw_2048)]
+.end:
+    REPX {pmulhrsw x, m8}, m0, m2, m4, m6
+.end2:
+    REPX {pmulhrsw x, m8}, m1, m3, m5, m7
+    lea r3, [strideq*3]
+    WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
+    WRITE_16X2 2, 3, 0, 1, strideq*2, r3
+.end3:
+    pxor m0, m0
+    REPX {mova [cq+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+.end4:
+    lea dstq, [dstq+strideq*4]
+    WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+    WRITE_16X2 6, 7, 0, 1, strideq*2, r3
+    RET
+ALIGN function_align
+cglobal_label .main
+    vpbroadcastd m10, [o(pd_2048)]
+.main2:
+    IDCT8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
+    ret
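+
+; The adst kernels below rotate coefficient pairs with ITX_MULSUB_2W,
+; which evaluates a*c +/- b*s per pair in 32-bit precision (pmaddwd),
+; then rounds with pd_2048 and a 12-bit shift before repacking to words.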
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, adst
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, identity
+
+cglobal iadst_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+    ITX_16X8_LOAD_COEFS 1302
+    call m(iadst_8x16_internal_8bpc).main2
+    call m(iadst_8x16_internal_8bpc).main_pass1_end
+    psubw m11, m9, m10
+    punpcklwd m8, m0, m2
+    punpckhwd m0, m2
+    punpckhwd m2, m1, m3
+    punpcklwd m1, m3
+    punpcklwd m9, m4, m6
+    punpckhwd m4, m6
+    punpckhwd m6, m5, m7
+    punpcklwd m5, m7
+    REPX {pmulhrsw x, m11}, m8, m1, m4, m6
+    jmp m(idct_16x8_internal_8bpc).pass1_end
+ALIGN function_align
+.pass2:
+    call .main
+    call .main_pass2_end
+    pxor m8, m8
+    psubw m8, m9
+    REPX {pmulhrsw x, m9}, m0, m2, m4, m6
+    jmp m(idct_16x8_internal_8bpc).end2
+ALIGN function_align
+cglobal_label .main
+    vpbroadcastd m10, [o(pd_2048)]
+    ITX_MULSUB_2W 7, 0, 8, 9, 10,  401, 4076 ; t1a, t0a
+    ITX_MULSUB_2W 3, 4, 8, 9, 10, 3166, 2598 ; t5a, t4a
+    ITX_MULSUB_2W 1, 6, 8, 9, 10, 3920, 1189 ; t7a, t6a
+    ITX_MULSUB_2W 5, 2, 8, 9, 10, 1931, 3612 ; t3a, t2a
+    psubsw m8, m2, m6 ; t6
+    paddsw m2, m6     ; t2
+    psubsw m6, m0, m4 ; t4
+    paddsw m0, m4     ; t0
+    psubsw m4, m5, m1 ; t7
+    paddsw m5, m1     ; t3
+    psubsw m1, m7, m3 ; t5
+    paddsw m7, m3     ; t1
+    ITX_MULSUB_2W 6, 1, 3, 9, 10, 1567, 3784 ; t5a, t4a
+    ITX_MULSUB_2W 4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a
+    psubsw m9, m6, m8 ;  t7
+    paddsw m6, m8     ;  out6
+    psubsw m3, m7, m5 ;  t3
+    paddsw m7, m5     ; -out7
+    psubsw m5, m0, m2 ;  t2
+    paddsw m0, m2     ;  out0
+    psubsw m2, m1, m4 ;  t6
+    paddsw m1, m4     ; -out1
+    ret
+ALIGN function_align
+.main_pass1_end:
+    vpbroadcastd m11, [o(pw_m2896_2896)]
+    vpbroadcastd m12, [o(pw_2896_2896)]
+    punpckhwd m4, m3, m5
+    punpcklwd m3, m5
+    pmaddwd m5, m11, m4
+    pmaddwd m4, m12
+    pmaddwd m8, m11, m3
+    pmaddwd m3, m12
+    REPX {paddd x, m10}, m5, m4, m8, m3
+    REPX {psrad x, 12 }, m5, m8, m4, m3
+    packssdw m3, m4     ; -out3
+    packssdw m4, m8, m5 ;  out4
+    punpcklwd m5, m9, m2
+    punpckhwd m9, m2
+    pmaddwd m2, m12, m5
+    pmaddwd m5, m11
+    pmaddwd m12, m9
+    pmaddwd m11, m9
+    REPX {paddd x, m10}, m2, m5, m12, m11
+    REPX {psrad x, 12 }, m2, m12, m5, m11
+    packssdw m2, m12    ;  out2
+    packssdw m5, m11    ; -out5
+    ret
+ALIGN function_align
+cglobal_label .main_pass2_end
+    vpbroadcastd m8, [o(pw_2896x8)]
+    psubsw m4, m5, m3
+    paddsw m3, m5
+    psubsw m5, m2, m9
+    paddsw m2, m9
+    pmulhrsw m2, m8 ;  out2
+    pmulhrsw m3, m8 ; -out3
+    pmulhrsw m4, m8 ;  out4
+    pmulhrsw m5, m8 ; -out5
+    vpbroadcastd m9, [o(pw_2048)]
+    ret
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+INV_TXFM_16X8_FN flipadst, identity
+
+cglobal iflipadst_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+    ITX_16X8_LOAD_COEFS 1302
+    call m(iadst_8x16_internal_8bpc).main2
+    call m(iadst_8x16_internal_8bpc).main_pass1_end
+    psubw m9, m10
+    punpcklwd m8, m6, m4
+    punpckhwd m6, m4
+    punpcklwd m4, m7, m5
+    punpckhwd m7, m5
+    punpckhwd m5, m3, m1
+    punpcklwd m3, m1
+    punpckhwd m1, m2, m0
+    punpcklwd m2, m0
+    REPX {pmulhrsw x, m10}, m8, m4, m5, m1
+    REPX {pmulhrsw x, m9 }, m6, m7, m3, m2
+    punpcklwd m0, m7, m4
+    punpckhwd m7, m4
+    punpckhwd m4, m6, m8
+    punpcklwd m6, m8
+    punpckhwd m8, m3, m5
+    punpcklwd m3, m5
+    punpcklwd m5, m2, m1
+    punpckhwd m2, m1
+    punpckhdq m1, m0, m6
+    punpckldq m0, m6
+    punpckldq m6, m7, m4
+    punpckhdq m7, m4
+    punpckhdq m4, m3, m5
+    punpckldq m3, m5
+    punpckldq m5, m8, m2
+    punpckhdq m8, m2
+    vinserti128 m2, m6, xm5, 1
+    vperm2i128 m6, m5, 0x31
+    vperm2i128 m5, m1, m4, 0x31
+    vinserti128 m1, xm4, 1
+    vperm2i128 m4, m0, m3, 0x31
+    vinserti128 m0, xm3, 1
+    vinserti128 m3, m7, xm8, 1
+    vperm2i128 m7, m8, 0x31
+    jmp tx2q
+.pass2:
+    call m(iadst_16x8_internal_8bpc).main
+    call m(iadst_16x8_internal_8bpc).main_pass2_end
+    pxor m8, m8
+    psubw m8, m9
+    pmulhrsw m10, m7, m8
+    pmulhrsw m7, m0, m9
+    pmulhrsw m0, m6, m9
+    pmulhrsw m6, m1, m8
+    pmulhrsw m1, m5, m8
+    pmulhrsw m5, m2, m9
+    pmulhrsw m2, m4, m9
+    pmulhrsw m4, m3, m8
+    lea r3, [strideq*3]
+    WRITE_16X2 10, 0, 8, 9, strideq*0, strideq*1
+    WRITE_16X2 1, 2, 0, 1, strideq*2, r3
+    jmp m(idct_16x8_internal_8bpc).end3
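+
+; flipadst is the adst with its output rows in reverse order, so the
+; kernels above reuse the adst math and only reorder (and sign-flip)
+; the results before writing them out.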
+
+INV_TXFM_16X8_FN identity, dct
+INV_TXFM_16X8_FN identity, adst
+INV_TXFM_16X8_FN identity, flipadst
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
+    mova xm7, [cq+16*0]
+    mova xm2, [cq+16*1]
+    add cq, 16*8
+    vpbroadcastd m3, [o(pw_2896x8)]
+    vinserti128 m7, [cq+16*0], 1
+    vinserti128 m2, [cq+16*1], 1
+    mova xm6, [cq-16*6]
+    mova xm4, [cq-16*5]
+    vinserti128 m6, [cq+16*2], 1
+    vinserti128 m4, [cq+16*3], 1
+    mova xm8, [cq-16*4]
+    mova xm5, [cq-16*3]
+    vinserti128 m8, [cq+16*4], 1
+    vinserti128 m5, [cq+16*5], 1
+    mova xm0, [cq-16*2]
+    mova xm1, [cq-16*1]
+    vinserti128 m0, [cq+16*6], 1
+    vinserti128 m1, [cq+16*7], 1
+    vpbroadcastd m10, [o(pw_1697x16)]
+    vpbroadcastd m11, [o(pw_16384)]
+    REPX {pmulhrsw x, m3}, m7, m2, m6, m4, m8, m5, m0, m1
+    punpcklwd m3, m7, m2
+    punpckhwd m7, m2
+    punpcklwd m2, m6, m4
+    punpckhwd m6, m4
+    punpcklwd m4, m8, m5
+    punpckhwd m8, m5
+    punpcklwd m5, m0, m1
+    punpckhwd m0, m1
+    punpckldq m1, m3, m2
+    punpckhdq m3, m2
+    punpckldq m2, m4, m5
+    punpckhdq m4, m5
+    punpckldq m5, m7, m6
+    punpckhdq m7, m6
+    punpckldq m6, m8, m0
+    punpckhdq m8, m0
+    REPX {IDTX16 x, 0, 10, 11}, 1, 3, 2, 4, 5, 7, 6, 8
+    punpcklqdq m0, m1, m2
+    punpckhqdq m1, m2
+    punpcklqdq m2, m3, m4
+    punpckhqdq m3, m4
+    punpcklqdq m4, m5, m6
+    punpckhqdq m5, m6
+    punpcklqdq m6, m7, m8
+    punpckhqdq m7, m8
+    jmp tx2q
+.pass2:
+    vpbroadcastd m8, [o(pw_4096)]
+    jmp m(idct_16x8_internal_8bpc).end
+
+%define o_base pw_5 + 128
+
+%macro INV_TXFM_16X16_FN 2 ; type1, type2
+    INV_TXFM_FN %1, %2, 16x16
+%ifidn %1_%2, dct_dct
+    movd xm1, [o(pw_2896x8)]
+    pmulhrsw xm0, xm1, [cq]
+    movd xm2, [o(pw_8192)]
+    mov [cq], eobd
+    or r3d, 16
+    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+%endif
+%endmacro
+
+%macro ITX_16X16_LOAD_COEFS 0
+    mova m0, [cq+32*0]
+    mova m1, [cq+32*1]
+    mova m2, [cq+32*2]
+    mova m3, [cq+32*3]
+    add cq, 32*8
+    mova m4, [cq-32*4]
+    mova m5, [cq-32*3]
+    mova m6, [cq-32*2]
+    mova m7, [cq-32*1]
+    mova m8, [cq+32*0]
+    mova m9, [cq+32*1]
+    mova m10, [cq+32*2]
+    mova m11, [cq+32*3]
+    mova m12, [cq+32*4]
+    mova m13, [cq+32*5]
+    mova m14, [cq+32*6]
+    mova m15, [cq+32*7]
+    mova [rsp], m15
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, adst
+INV_TXFM_16X16_FN dct, flipadst
+INV_TXFM_16X16_FN dct, identity
+
+cglobal idct_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
+    ITX_16X16_LOAD_COEFS
+    call .main
+.pass1_end:
+    vpbroadcastd m1, [o(pw_8192)]
+    REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
+    vextracti128 [rsp+16*5], m8, 1
+    mova [rsp+16*1], xm8
+.pass1_end2:
+    vextracti128 [rsp+16*4], m0, 1
+    mova [rsp+16*0], xm0
+    REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15
+    pmulhrsw m1, [rsp+32*1]
+    vperm2i128 m8, m1, m9, 0x31
+    vinserti128 m1, xm9, 1
+    vperm2i128 m9, m2, m10, 0x31
+    vinserti128 m2, xm10, 1
+    vperm2i128 m10, m3, m11, 0x31
+    vinserti128 m3, xm11, 1
+    vperm2i128 m11, m4, m12, 0x31
+    vinserti128 m4, xm12, 1
+    vperm2i128 m12, m5, m13, 0x31
+    vinserti128 m5, xm13, 1
+    vperm2i128 m13, m6, m14, 0x31
+    vinserti128 m6, xm14, 1
+    vperm2i128 m14, m7, m15, 0x31
+    vinserti128 m7, xm15, 1
+    mova m15, [rsp+32*2]
+.pass1_end3:
+    punpcklwd m0, m9, m10
+    punpckhwd m9, m10
+    punpcklwd m10, m15, m8
+    punpckhwd m15, m8
+    punpckhwd m8, m11, m12
+    punpcklwd m11, m12
+    punpckhwd m12, m13, m14
+    punpcklwd m13, m14
+    punpckhdq m14, m11, m13
+    punpckldq m11, m13
+    punpckldq m13, m15, m9
+    punpckhdq m15, m9
+    punpckldq m9, m10, m0
+    punpckhdq m10, m0
+    punpckhdq m0, m8, m12
+    punpckldq m8, m12
+    punpcklqdq m12, m13, m8
+    punpckhqdq m13, m8
+    punpcklqdq m8, m9, m11
+    punpckhqdq m9, m11
+    punpckhqdq m11, m10, m14
+    punpcklqdq m10, m14
+    punpcklqdq m14, m15, m0
+    punpckhqdq m15, m0
+    mova m0, [rsp]
+    mova [rsp], m15
+    punpckhwd m15, m4, m5
+    punpcklwd m4, m5
+    punpckhwd m5, m0, m1
+    punpcklwd m0, m1
+    punpckhwd m1, m6, m7
+    punpcklwd m6, m7
+    punpckhwd m7, m2, m3
+    punpcklwd m2, m3
+    punpckhdq m3, m0, m2
+    punpckldq m0, m2
+    punpckldq m2, m4, m6
+    punpckhdq m4, m6
+    punpckhdq m6, m5, m7
+    punpckldq m5, m7
+    punpckldq m7, m15, m1
+    punpckhdq m15, m1
+    punpckhqdq m1, m0, m2
+    punpcklqdq m0, m2
+    punpcklqdq m2, m3, m4
+    punpckhqdq m3, m4
+    punpcklqdq m4, m5, m7
+    punpckhqdq m5, m7
+    punpckhqdq m7, m6, m15
+    punpcklqdq m6, m15
+    jmp tx2q
+.pass2:
+    call .main
+.end:
+    vpbroadcastd m1, [o(pw_2048)]
+    REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
+    mova [rsp], m6
+.end2:
+    REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15
+    pmulhrsw m1, [rsp+32*1]
+    lea r3, [strideq*3]
+    WRITE_16X2 0, 1, 6, 0, strideq*0, strideq*1
+    WRITE_16X2 2, 3, 0, 1, strideq*2, r3
+    lea dstq, [dstq+strideq*4]
+    WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+    WRITE_16X2 [rsp], 7, 0, 1, strideq*2, r3
+.end3:
+    pxor m2, m2
+    REPX {mova [cq+32*x], m2}, -8, -7, -6, -5, -4, -3, -2, -1
+    lea dstq, [dstq+strideq*4]
+    WRITE_16X2 8, 9, 0, 1, strideq*0, strideq*1
+    WRITE_16X2 10, 11, 0, 1, strideq*2, r3
+    REPX {mova [cq+32*x], m2}, 0, 1, 2, 3, 4, 5, 6, 7
+    lea dstq, [dstq+strideq*4]
+    WRITE_16X2 12, 13, 0, 1, strideq*0, strideq*1
+    WRITE_16X2 14, 15, 0, 1, strideq*2, r3
+    RET
+ALIGN function_align
+cglobal_label .main
+    vpbroadcastd m15, [o(pd_2048)]
+    mova [rsp+gprsize+32*1], m1
+    mova [rsp+gprsize+32*2], m9
+    IDCT8_1D 0, 2, 4, 6, 8, 10, 12, 14, 1, 9, 15
+    mova m1, [rsp+gprsize+32*2]  ; in9
+    mova [rsp+gprsize+32*2], m14 ; tmp7
+    mova m9, [rsp+gprsize+32*1]  ; in1
+    mova [rsp+gprsize+32*1], m10 ; tmp5
+    mova m14, [rsp+gprsize+32*0] ; in15
+    mova [rsp+gprsize+32*0], m6  ; tmp3
+    IDCT16_1D_ODDHALF 9, 3, 5, 7, 1, 11, 13, 14, 6, 10, 15
+    mova m6, [rsp+gprsize+32*1]  ; tmp5
+    psubsw m15, m0, m14 ; out15
+    paddsw m0, m14      ; out0
+    psubsw m14, m2, m13 ; out14
+    paddsw m2, m13      ; out1
+    mova [rsp+gprsize+32*1], m2
+    psubsw m13, m4, m11 ; out13
+    paddsw m2, m4, m11  ; out2
+    psubsw m11, m8, m7  ; out11
+    paddsw m4, m8, m7   ; out4
+    mova m7, [rsp+gprsize+32*2]  ; tmp7
+    psubsw m10, m6, m5  ; out10
+    paddsw m5, m6       ; out5
+    psubsw m8, m7, m9   ; out8
+    paddsw m7, m9       ; out7
+    psubsw m9, m12, m3  ; out9
+    paddsw m6, m12, m3  ; out6
+    mova m3, [rsp+gprsize+32*0]  ; tmp3
+    psubsw m12, m3, m1  ; out12
+    paddsw m3, m1       ; out3
+    ret
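+
+; The 16x16 kernels need more rows than the 16 available ymm registers,
+; so some rows are spilled to the stack ([rsp+gprsize+...]) around the
+; 1D calls; gprsize accounts for the return address pushed by call.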
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, adst
+INV_TXFM_16X16_FN adst, flipadst
+
+cglobal iadst_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
+    ITX_16X16_LOAD_COEFS
+    call .main
+    call .main_pass1_end
+    pmulhrsw m0, m1, [cq+32*0]
+    pmulhrsw m2, m1, [cq+32*1]
+    REPX {pmulhrsw x, m1}, m4, m6, m8, m10
+    pmulhrsw m12, m1, [cq+32*2]
+    pmulhrsw m14, m1, [cq+32*3]
+    vextracti128 [rsp+16*5], m8, 1
+    mova [rsp+16*1], xm8
+    pxor m8, m8
+    psubw m1, m8, m1
+    jmp m(idct_16x16_internal_8bpc).pass1_end2
+ALIGN function_align
+.pass2:
+    call .main
+    call .main_pass2_end
+    REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
+    mova [rsp+32*0], m6
+    pxor m6, m6
+    psubw m1, m6, m1
+    jmp m(idct_16x16_internal_8bpc).end2
+ALIGN function_align
+cglobal_label .main
+    vpbroadcastd m15, [o(pd_2048)]
+    mova [rsp+gprsize+32*1], m0
+    mova [rsp+gprsize+32*2], m4
+    ITX_MULSUB_2W 13, 2, 0, 4, 15,  995, 3973 ; t3,  t2
+    ITX_MULSUB_2W 9, 6, 0, 4, 15,  2440, 3290 ; t7,  t6
+    ITX_MULSUB_2W 5, 10, 0, 4, 15, 3513, 2106 ; t11, t10
+    ITX_MULSUB_2W 1, 14, 0, 4, 15, 4052,  601 ; t15, t14
+    psubsw m0, m2, m10  ; t10a
+    paddsw m2, m10      ; t2a
+    psubsw m10, m13, m5 ; t11a
+    paddsw m13, m5      ; t3a
+    psubsw m5, m6, m14  ; t14a
+    paddsw m6, m14      ; t6a
+    psubsw m14, m9, m1  ; t15a
+    paddsw m9, m1       ; t7a
+    ITX_MULSUB_2W 0, 10, 1, 4, 15, 3406, 2276 ; t11, t10
+    ITX_MULSUB_2W 14, 5, 1, 4, 15, 2276, 3406 ; t14, t15
+    psubsw m1, m10, m14 ; t14a
+    paddsw m10, m14     ; t10a
+    psubsw m14, m0, m5  ; t15a
+    paddsw m0, m5       ; t11a
+    psubsw m5, m2, m6   ; t6
+    paddsw m2, m6       ; t2
+    psubsw m6, m13, m9  ; t7
+    paddsw m13, m9      ; t3
+    ITX_MULSUB_2W 6, 5, 4, 9, 15, 3784, 1567  ; t6a, t7a
+    ITX_MULSUB_2W 14, 1, 4, 9, 15, 3784, 1567 ; t14, t15
+    mova m9, [rsp+gprsize+32*0]  ; in15
+    mova [rsp+gprsize+32*0], m10 ; t10a
+    mova m4, [rsp+gprsize+32*1]  ; in0
+    mova [rsp+gprsize+32*1], m6  ; t6a
+    mova m6, [rsp+gprsize+32*2]  ; in4
+    mova [rsp+gprsize+32*2], m2  ; t2
+    ITX_MULSUB_2W 9, 4, 2, 10, 15,  201, 4091 ; t1,  t0
+    ITX_MULSUB_2W 11, 6, 2, 10, 15, 1751, 3703 ; t5, t4
+    ITX_MULSUB_2W 7, 8, 2, 10, 15, 3035, 2751 ; t9,  t8
+    ITX_MULSUB_2W 3, 12, 2, 10, 15, 3857, 1380 ; t13, t12
+    psubsw m10, m4, m8  ; t8a
+    paddsw m8, m4       ; t0a
+    psubsw m4, m9, m7   ; t9a
+    paddsw m9, m7       ; t1a
+    psubsw m7, m6, m12  ; t12a
+    paddsw m6, m12      ; t4a
+    psubsw m12, m11, m3 ; t13a
+    paddsw m11, m3      ; t5a
+    ITX_MULSUB_2W 10, 4, 2, 3, 15, 799, 4017  ; t9,  t8
+    ITX_MULSUB_2W 12, 7, 2, 3, 15, 4017, 799  ; t12, t13
+    psubsw m3, m9, m11  ; t5
+    paddsw m9, m11      ; t1
+    psubsw m11, m4, m12 ; t12a
+    paddsw m4, m12      ; t8a
+    paddsw m12, m8, m6  ; t0
+    psubsw m8, m6       ; t4
+    paddsw m6, m10, m7  ; t9a
+    psubsw m10, m7      ; t13a
+    ITX_MULSUB_2W 8, 3, 2, 7, 15, 1567, 3784   ; t5a, t4a
+    ITX_MULSUB_2W 11, 10, 2, 7, 15, 1567, 3784 ; t13, t12
+    mova m7, [rsp+gprsize+32*0] ; t10a
+    mova m2, [rsp+gprsize+32*1] ; t6a
+    paddsw m15, m9, m13 ; -out15
+    psubsw m9, m13      ;  t3a
+    paddsw m13, m11, m1 ; -out13
+    psubsw m11, m1      ;  t15a
+    psubsw m1, m4, m7   ;  t10
+    paddsw m7, m4       ; -out1
+    psubsw m4, m3, m2   ;  t6
+    paddsw m3, m2       ; -out3
+    paddsw m2, m10, m14 ;  out2
+    psubsw m10, m14     ;  t14a
+    paddsw m14, m6, m0  ;  out14
+    psubsw m6, m0       ;  t11
+    mova m0, [rsp+gprsize+32*2] ; t2
+    mova [rsp+gprsize+32*1], m7
+    psubsw m7, m12, m0  ;  t2a
+    paddsw m0, m12      ;  out0
+    paddsw m12, m8, m5  ;  out12
+    psubsw m8, m5       ;  t7
+    ret
+ALIGN function_align
+.main_pass1_end:
+    mova [cq+32*0], m0
+    mova [cq+32*1], m2
+    mova [cq+32*2], m12
+    mova [cq+32*3], m14
+    vpbroadcastd m14, [pw_m2896_2896]
+    vpbroadcastd m12, [pw_2896_2896]
+    vpbroadcastd m2, [pd_2048]
+    punpcklwd m5, m11, m10
+    punpckhwd m11, m10
+    pmaddwd m10, m14, m5
+    pmaddwd m0, m14, m11
+    pmaddwd m5, m12
+    pmaddwd m11, m12
+    REPX {paddd x, m2}, m10, m0, m5, m11
+    REPX {psrad x, 12}, m10, m0, m5, m11
+    packssdw m10, m0  ;  out10
+    packssdw m5, m11  ; -out5
+    punpcklwd m11, m8, m4
+    punpckhwd m8, m4
+    pmaddwd m4, m12, m11
+    pmaddwd m0, m12, m8
+    pmaddwd m11, m14
+    pmaddwd m8, m14
+    REPX {paddd x, m2}, m4, m0, m11, m8
+    REPX {psrad x, 12}, m4, m0, m11, m8
+    packssdw m4, m0   ;  out4
+    packssdw m11, m8  ; -out11
+    punpcklwd m8, m9, m7
+    punpckhwd m9, m7
+    pmaddwd m7, m12, m8
+    pmaddwd m0, m12, m9
+    pmaddwd m8, m14
+    pmaddwd m9, m14
+    REPX {paddd x, m2}, m7, m0, m8, m9
+    REPX {psrad x, 12}, m7, m0, m8, m9
+    packssdw m7, m0   ; -out7
+    packssdw m8, m9   ;  out8
+    punpckhwd m0, m6, m1
+    punpcklwd m6, m1
+    pmaddwd m1, m14, m0
+    pmaddwd m9, m14, m6
+    pmaddwd m0, m12
+    pmaddwd m6, m12
+    REPX {paddd x, m2}, m1, m9, m0, m6
+    REPX {psrad x, 12}, m1, m9, m0, m6
+    packssdw m9, m1   ; -out9
+    packssdw m6, m0   ;  out6
+    vpbroadcastd m1, [o(pw_8192)]
+    ret
+ALIGN function_align
+cglobal_label .main_pass2_end
+    ; In pass 2 we're going to clip to pixels afterwards anyway, so clipping to
+    ; 16-bit here will produce the same result as using 32-bit intermediates.
+    paddsw m5, m10, m11 ; -out5
+    psubsw m10, m11     ;  out10
+    psubsw m11, m4, m8  ; -out11
+    paddsw m4, m8       ;  out4
+    psubsw m8, m7, m9   ;  out8
+    paddsw m7, m9       ; -out7
+    psubsw m9, m1, m6   ; -out9
+    paddsw m6, m1       ;  out6
+    vpbroadcastd m1, [o(pw_2896x8)]
+    REPX {pmulhrsw x, m1}, m4, m5, m6, m7, m8, m9, m10, m11
+    vpbroadcastd m1, [o(pw_2048)]
+    ret
+
+INV_TXFM_16X16_FN flipadst, dct
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
+    ITX_16X16_LOAD_COEFS
+    call m(iadst_16x16_internal_8bpc).main
+    call m(iadst_16x16_internal_8bpc).main_pass1_end
+    pmulhrsw m6, m1
+    pmulhrsw m2, m1, m8
+    mova [rsp+32*2], m6
+    pmulhrsw m6, m1, m4
+    pmulhrsw m4, m1, m10
+    pmulhrsw m8, m1, [cq+32*3]
+    pmulhrsw m10, m1, [cq+32*2]
+    pmulhrsw m12, m1, [cq+32*1]
+    pmulhrsw m14, m1, [cq+32*0]
+    pxor m0, m0
+    psubw m0, m1
+    REPX {pmulhrsw x, m0}, m3, m5, m7, m11, m15
+    pmulhrsw m1, m0, m9
+    pmulhrsw m9, m0, m13
+    pmulhrsw m0, [rsp+32*1]
+    mova [rsp+16*0], xm15
+    mova [rsp+16*1], xm7
+    vperm2i128 m15, m15, m7, 0x31
+    vinserti128 m7, m2, xm14, 1
+    vperm2i128 m14, m2, m14, 0x31
+    vinserti128 m2, m9, xm5, 1
+    vperm2i128 m9, m9, m5, 0x31
+    vinserti128 m5, m4, xm12, 1
+    vperm2i128 m12, m4, m12, 0x31
+    vinserti128 m4, m11, xm3, 1
+    vperm2i128 m11, m11, m3, 0x31
+    vinserti128 m3, m10, xm6, 1
+    vperm2i128 m10, m10, m6, 0x31
+    vinserti128 m6, m1, xm0, 1
+    vperm2i128 m13, m1, m0, 0x31
+    vinserti128 m1, m8, [rsp+32*2], 1
+    vperm2i128 m8, m8, [rsp+32*2], 0x31
+    jmp m(idct_16x16_internal_8bpc).pass1_end3
+.pass2:
+    call m(iadst_16x16_internal_8bpc).main
+    call m(iadst_16x16_internal_8bpc).main_pass2_end
+    pmulhrsw m0, m1
+    pmulhrsw m8, m1
+    mova [rsp+32*0], m0
+    mova [rsp+32*2], m8
+    pxor m0, m0
+    psubw m0, m1
+    pmulhrsw m8, m0, m7
+    pmulhrsw m7, m0, m9
+    pmulhrsw m9, m1, m6
+    pmulhrsw m6, m1, m10
+    pmulhrsw m10, m0, m5
+    pmulhrsw m5, m0, m11
+    pmulhrsw m11, m1, m4
+    pmulhrsw m4, m1, m12
+    pmulhrsw m12, m0, m3
+    pmulhrsw m3, m0, m13
+    pmulhrsw m13, m1, m2
+    pmulhrsw m1, m14
+    pmulhrsw m14, m0, [rsp+32*1]
+    pmulhrsw m0, m15
+    lea r3, [strideq*3]
+    WRITE_16X2 0, 1, 2, 0, strideq*0, strideq*1
+    mova m15, [rsp+32*0]
+    WRITE_16X2 3, 4, 0, 1, strideq*2, r3
+    lea dstq, [dstq+strideq*4]
+    WRITE_16X2 5, 6, 0, 1, strideq*0, strideq*1
+    WRITE_16X2 7, [rsp+32*2], 0, 1, strideq*2, r3
+    jmp m(idct_16x16_internal_8bpc).end3
+
+%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16
+    pmulhrsw m%2, m%3, m%1
+    psraw m%2, 1
+    pavgw m%1, m%2 ; signs are guaranteed to be equal
+%endmacro
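+
+; IDTX16B evaluates the same identity scaling as IDTX16 but with a final
+; halving folded into pavgw: (x + x*1697/4096 + 1) >> 1 ~= x*sqrt(2)/2.
+; pavgw averages unsigned words, hence the equal-signs requirement.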
+
+INV_TXFM_16X16_FN identity, dct
+INV_TXFM_16X16_FN identity, identity
+
+cglobal iidentity_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
+    vpbroadcastd m7, [o(pw_1697x16)]
+    mova xm0, [cq+16* 0]
+    vinserti128 m0, [cq+16*16], 1
+    mova xm15, [cq+16* 1]
+    vinserti128 m15, [cq+16*17], 1
+    mova xm1, [cq+16* 2]
+    vinserti128 m1, [cq+16*18], 1
+    mova xm8, [cq+16* 3]
+    vinserti128 m8, [cq+16*19], 1
+    mova xm2, [cq+16* 4]
+    vinserti128 m2, [cq+16*20], 1
+    mova xm9, [cq+16* 5]
+    vinserti128 m9, [cq+16*21], 1
+    mova xm3, [cq+16* 6]
+    vinserti128 m3, [cq+16*22], 1
+    mova xm10, [cq+16* 7]
+    add cq, 16*16
+    vinserti128 m10, [cq+16* 7], 1
+    mova xm4, [cq-16* 8]
+    vinserti128 m4, [cq+16* 8], 1
+    mova xm11, [cq-16* 7]
+    vinserti128 m11, [cq+16* 9], 1
+    mova xm5, [cq-16* 6]
+    vinserti128 m5, [cq+16*10], 1
+    mova xm12, [cq-16* 5]
+    vinserti128 m12, [cq+16*11], 1
+    mova xm13, [cq-16* 3]
+    vinserti128 m13, [cq+16*13], 1
+    mova xm14, [cq-16* 1]
+    vinserti128 m14, [cq+16*15], 1
+    REPX {IDTX16B x, 6, 7}, 0, 15, 1, 8, 2, 9, 3, \
+                            10, 4, 11, 5, 12, 13, 14
+    mova xm6, [cq-16* 4]
+    vinserti128 m6, [cq+16*12], 1
+    mova [rsp], m0
+    IDTX16B 6, 0, 7
+    mova xm0, [cq-16* 2]
+    vinserti128 m0, [cq+16*14], 1
+    pmulhrsw m7, m0
+    psraw m7, 1
+    pavgw m7, m0
+    jmp m(idct_16x16_internal_8bpc).pass1_end3
+ALIGN function_align
+.pass2:
+    vpbroadcastd m15, [o(pw_1697x16)]
+    mova [rsp+32*1], m0
+    REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \
+                            8, 9, 10, 11, 12, 13, 14
+    mova m0, [rsp+32*1]
+    mova [rsp+32*1], m1
+    IDTX16 0, 1, 15
+    mova m1, [rsp+32*0]
+    pmulhrsw m15, m1
+    paddsw m1, m1
+    paddsw m15, m1
+    jmp m(idct_16x16_internal_8bpc).end
+
+%define o_base deint_shuf + 128
+
+%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2
+%if %3
+    vpbroadcastd m15, [o(pw_2896x8)]
+    pmulhrsw m0, m15, [%1+%2*0]
+    pmulhrsw m1, m15, [%1+%2*1]
+    pmulhrsw m2, m15, [%1+%2*2]
+    pmulhrsw m3, m15, [%1+%2*3]
+    pmulhrsw m4, m15, [%1+%2*4]
+    pmulhrsw m5, m15, [%1+%2*5]
+    pmulhrsw m6, m15, [%1+%2*6]
+    pmulhrsw m7, m15, [%1+%2*7]
+%else
+    mova m0, [%1+%2*0]
+    mova m1, [%1+%2*1]
+    mova m2, [%1+%2*2]
+    mova m3, [%1+%2*3]
+    mova m4, [%1+%2*4]
+    mova m5, [%1+%2*5]
+    mova m6, [%1+%2*6]
+    mova m7, [%1+%2*7]
+%endif
+%endmacro
+
+%macro LOAD_8ROWS_H 2-3 0 ; src, stride, is_rect2
+%if %3
+%if %3 == 1
+    vpbroadcastd m15, [o(pw_2896x8)]
+%endif
+    pmulhrsw m8, m15, [%1+%2*0]
+    pmulhrsw m9, m15, [%1+%2*1]
+    pmulhrsw m10, m15, [%1+%2*2]
+    pmulhrsw m11, m15, [%1+%2*3]
+    pmulhrsw m12, m15, [%1+%2*4]
+    pmulhrsw m13, m15, [%1+%2*5]
+    pmulhrsw m14, m15, [%1+%2*6]
+    pmulhrsw m15, [%1+%2*7]
+%else
+    mova m8, [%1+%2*0]
+    mova m9, [%1+%2*1]
+    mova m10, [%1+%2*2]
+    mova m11, [%1+%2*3]
+    mova m12, [%1+%2*4]
+    mova m13, [%1+%2*5]
+    mova m14, [%1+%2*6]
+    mova m15, [%1+%2*7]
+%endif
+%endmacro
+
+%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4]
+    vpbroadcastd m%3, [r5-pw_201_4091x8+pw_%4_%5x8]
+    punpcklwd m%1, m%2, m%2
+    pmulhrsw m%1, m%3
+    vpbroadcastd m%3, [r5-pw_201_4091x8+pw_%6_%7x8]
+    punpckhwd m%2, m%2
+    pmulhrsw m%2, m%3
+%endmacro
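+
+; ITX_UNPACK_MULHRSW duplicates the packed input words and multiplies
+; the two copies by different x8 constants, producing two rotated halves
+; per invocation; r5 holds the base of the pw_*x8 constant table.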
+
+cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob
+    lea r6, [o_base]
+    test eobd, eobd
+    jz .dconly
+    PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob
+    %undef cmp
+    cmp eobd, 106
+    jle .fast
+    LOAD_8ROWS cq+32*1, 32*2
+    call m(idct_16x8_internal_8bpc).main
+    vperm2i128 m11, m0, m4, 0x31
+    vinserti128 m0, xm4, 1
+    vperm2i128 m4, m1, m5, 0x31
+    vinserti128 m1, xm5, 1
+    vperm2i128 m5, m2, m6, 0x31
+    vinserti128 m2, xm6, 1
+    vperm2i128 m6, m3, m7, 0x31
+    vinserti128 m3, xm7, 1
+    pxor m7, m7
+    REPX {mova [cq+32*x], m7}, 1, 3, 5, 7, 9, 11, 13, 15
+    punpckhwd m7, m0, m1
+    punpcklwd m0, m1
+    punpckhwd m1, m2, m3
+    punpcklwd m2, m3
+    punpcklwd m3, m11, m4
+    punpckhwd m11, m4
+    punpckhwd m4, m5, m6
+    punpcklwd m5, m6
+    punpckhdq m6, m0, m2
+    punpckldq m0, m2
+    punpckldq m2, m3, m5
+    punpckhdq m3, m5
+    punpckhdq m5, m11, m4
+    punpckldq m11, m4
+    punpckldq m4, m7, m1
+    punpckhdq m7, m1
+    punpckhqdq m12, m6, m0
+    punpcklqdq m0, m6   ; out4
+    punpckhqdq m13, m7, m4
+    punpcklqdq m4, m7   ; out5
+    punpckhqdq m14, m3, m2
+    punpcklqdq m2, m3   ; out6
+    punpckhqdq m15, m5, m11
+    punpcklqdq m11, m5  ; out7
+    mova [rsp+32*0], m0
+    mova [rsp+32*1], m4
+    mova [rsp+32*2], m2
+.fast:
+    LOAD_8ROWS cq+32*0, 32*2
+    call m(idct_16x8_internal_8bpc).main
+    vperm2i128 m8, m0, m4, 0x31
+    vinserti128 m0, xm4, 1
+    vperm2i128 m4, m1, m5, 0x31
+    vinserti128 m1, xm5, 1
+    vperm2i128 m5, m2, m6, 0x31
+    vinserti128 m2, xm6, 1
+    vperm2i128 m6, m3, m7, 0x31
+    vinserti128 m3, xm7, 1
+    vpbroadcastd m9, [o(pw_8192)]
+    pxor m7, m7
+    REPX {mova [cq+32*x], m7}, 0, 2, 4, 6, 8, 10, 12, 14
+    punpckhwd m7, m0, m1
+    punpcklwd m0, m1
+    punpckhwd m1, m2, m3
+    punpcklwd m2, m3
+    punpckhwd m3, m8, m4
+    punpcklwd m8, m4
+    punpckhwd m4, m5, m6
+    punpcklwd m5, m6
+    punpckhdq m6, m0, m2
+    punpckldq m0, m2
+    punpckldq m2, m8, m5
+    punpckhdq m8, m5
+    punpckhdq m5, m3, m4
+    punpckldq m3, m4
+    punpckhdq m4, m7, m1
+    punpckldq m7, m1
+    punpcklqdq m1, m7, m4
+    punpckhqdq m7, m4   ; out9
+    punpckhqdq m4, m2, m8 ; out10
+    punpcklqdq m2, m8
+    punpckhqdq m8, m3, m5
+    punpcklqdq m3, m5
+    punpckhqdq m5, m0, m6 ; out8
+    punpcklqdq m0, m6
+    REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m7
+    cmp eobd, 106
+    jg .full
+    mova [rsp+32*0], m5
+    mova [rsp+32*1], m7
+    mova [rsp+32*2], m4
+    pmulhrsw m11, m9, m8
+    pxor m4, m4
+    REPX {mova x, m4}, m5, m6, m7
+    call .main_fast
+    jmp .pass2
+.dconly:
+    movd xm1, [o(pw_2896x8)]
+    pmulhrsw xm0, xm1, [cq]
+    movd xm2, [o(pw_8192)]
+    mov [cq], eobd
+    or r3d, 32
+    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
+.full:
+    REPX {pmulhrsw x, m9}, m12, m13, m14, m15
+    pmulhrsw m6, m9, [rsp+32*2]
+    mova [rsp+32*2], m4
+    pmulhrsw m4, m9, [rsp+32*0]
+    mova [rsp+32*0], m5
+    pmulhrsw m5, m9, [rsp+32*1]
+    mova [rsp+32*1], m7
+    pmulhrsw m7, m9, m11
+    pmulhrsw m11, m9, m8
+    call .main
+.pass2:
+    vpbroadcastd m12, [o(pw_2048)]
+    REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7, \
+                            m8, m9, m10, m11, m13, m14, m15
+    pmulhrsw m12, [rsp]
+    REPX {vpermq x, x, q3120}, m0, m2, m4, m6, m8, m10, m12, m14
+    REPX {vpermq x, x, q2031}, m1, m3, m5, m7, m9, m11, m13, m15
+    mova [rsp+32*0], m4
+    mova [rsp+32*1], m6
+    lea r3, [strideq*3]
+    WRITE_8X4 0, 1, 4, 6
+    lea dstq, [dstq+strideq*4]
+    WRITE_8X4 2, 3, 4, 6
+    lea dstq, [dstq+strideq*4]
+    WRITE_8X4 [rsp+32*0], 5, 4, 6
+    lea dstq, [dstq+strideq*4]
+    WRITE_8X4 [rsp+32*1], 7, 4, 6
+    lea dstq, [dstq+strideq*4]
+    WRITE_8X4 8, 9, 4, 6
+    lea dstq, [dstq+strideq*4]
+    WRITE_8X4 10, 11, 4, 6
+    lea dstq, [dstq+strideq*4]
+    WRITE_8X4 12, 13, 4, 6
+    lea dstq, [dstq+strideq*4]
+    WRITE_8X4 14, 15, 4, 6
+    RET
+ALIGN function_align
+cglobal_label .main_fast ; bottom half is zero
+    call m(idct_8x16_internal_8bpc).main
+    mova m8, [rsp+gprsize+0*32]
+    mova [rsp+gprsize+0*32], m0
+    mova m9, [rsp+gprsize+1*32]
+    mova [rsp+gprsize+1*32], m1
+    mova m0, [rsp+gprsize+2*32]
+    mova [rsp+gprsize+2*32], m6
+    lea r5, [r6-(o_base)+pw_201_4091x8]
+    ITX_UNPACK_MULHRSW 1, 8, 6, 201, 4091, m601, 4052   ; t16a, t31a, t23a, t24a
+    ITX_UNPACK_MULHRSW 15, 9, 6, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
+    ITX_UNPACK_MULHRSW 14, 0, 6, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a
+    ITX_UNPACK_MULHRSW 13, 11, 6, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a
+    jmp .main2
+ALIGN function_align
+cglobal_label .main
+    call m(idct_8x16_internal_8bpc).main
+    mova m8, [rsp+gprsize+0*32]
+    mova [rsp+gprsize+0*32], m0
+    mova m9, [rsp+gprsize+1*32]
+    mova [rsp+gprsize+1*32], m1
+    mova m0, [rsp+gprsize+2*32]
+    mova [rsp+gprsize+2*32], m6
+    punpcklwd m1, m15, m8  ; in31 in1
+    punpckhwd m8, m15      ; in3  in29
+    punpcklwd m15, m14, m9 ; in27 in5
+    punpckhwd m9, m14      ; in7  in25
+    punpcklwd m14, m13, m0 ; in23 in9
+    punpckhwd m0, m13      ; in11 in21
+    punpcklwd m13, m12, m11 ; in19 in13
+    punpckhwd m11, m12      ; in15 in17
+    ITX_MUL2X_PACK 1, 6, 12, 10,  201, 4091, 3 ; t16a, t31a
+    ITX_MUL2X_PACK 8, 6, 12, 10, 4052,  601, 3 ; t23a, t24a
+    ITX_MUL2X_PACK 15, 6, 12, 10,  995, 3973, 3 ; t20a, t27a
+    ITX_MUL2X_PACK 9, 6, 12, 10, 3857, 1380, 3 ; t19a, t28a
+    ITX_MUL2X_PACK 14, 6, 12, 10, 1751, 3703, 3 ; t18a, t29a
+    ITX_MUL2X_PACK 0, 6, 12, 10, 3513, 2106, 3 ; t21a, t26a
+    ITX_MUL2X_PACK 13, 6, 12, 10, 2440, 3290, 3 ; t22a, t25a
+    ITX_MUL2X_PACK 11, 6, 12, 10, 3035, 2751, 3 ; t17a, t30a
+.main2:
+    psubsw m6, m1, m11  ; t17 t30
+    paddsw m1, m11      ; t16 t31
+    psubsw m11, m9, m14 ; t18 t29
+    paddsw m9, m14      ; t19 t28
+    psubsw m14, m15, m0 ; t21 t26
+    paddsw m15, m0      ; t20 t27
+    psubsw m0, m8, m13  ; t22 t25
+    paddsw m8, m13      ; t23 t24
+    ITX_MUL2X_PACK 6, 12, 13, 10,   799, 4017, 3 ; t17a t30a
+    ITX_MUL2X_PACK 11, 12, 13, 10, m4017,  799, 3 ; t18a t29a
+    ITX_MUL2X_PACK 14, 12, 13, 10,  3406, 2276, 3 ; t21a t26a
+    ITX_MUL2X_PACK 0, 12, 13, 10, m2276, 3406, 3 ; t22a t25a
+    psubsw m13, m1, m9  ; t19a t28a
+    paddsw m1, m9       ; t16a t31a
+    psubsw m9, m8, m15  ; t20a t27a
+    paddsw m8, m15      ; t23a t24a
+    psubsw m15, m6, m11 ; t18  t29
+    paddsw m6, m11      ; t17  t30
+    psubsw m11, m0, m14 ; t21  t26
+    paddsw m0, m14      ; t22  t25
+    ITX_MUL2X_PACK 15, 12, 14, 10,  1567, 3784, 3 ; t18a t29a
+    ITX_MUL2X_PACK 13, 12, 14, 10,  1567, 3784, 3 ; t19  t28
+    ITX_MUL2X_PACK 9, 12, 14, 10, m3784, 1567, 3 ; t20  t27
+    ITX_MUL2X_PACK 11, 12, 14, 10, m3784, 1567, 3 ; t21a t26a
+    vbroadcasti128 m12, [o(deint_shuf)]
+    psubsw m14, m1, m8  ; t23  t24
+    paddsw m1, m8       ; t16  t31
+    psubsw m8, m6, m0   ; t22a t25a
+    paddsw m6, m0       ; t17a t30a
+    psubsw m0, m15, m11 ; t21  t26
+    paddsw m15, m11     ; t18  t29
+    psubsw m11, m13, m9 ; t20a t27a
+    paddsw m13, m9      ; t19a t28a
+    REPX {pshufb x, m12}, m1, m6, m15, m13
+    ITX_MUL2X_PACK 14, 9, 12, 10, 2896, 2896 ; t24a t23a
+    vpbroadcastd m9, [o(pw_m2896_2896)]
+    ITX_MUL2X_PACK 8, 12, _, 10, 12, 9, 4 ; t22 t25
+    vpbroadcastd m12, [o(pw_2896_2896)]
+    ITX_MUL2X_PACK 0, 12, _, 10, 12, 9, 4 ; t21a t26a
+    vpbroadcastd m12, [o(pw_2896_2896)]
+    ITX_MUL2X_PACK 11, 9, _, 10, 9, 12, 4 ; t27 t20
+    shufps m9, m14, m8, q1032 ; t23a t22
+    vpblendd m14, m8, 0xcc    ; t24a t25
+    shufps m8, m11, m0, q1032 ; t20  t21a
+    vpblendd m11, m0, 0xcc    ; t27  t26a
+    punpcklqdq m0, m1, m6     ; t16  t17a
+    punpckhqdq m1, m6         ; t31  t30a
+    psubsw m10, m5, m8  ; out20 out21
+    paddsw m5, m8       ; out11 out10
+    psubsw m6, m3, m14  ; out24 out25
+    paddsw m3, m14      ; out7  out6
+    psubsw m8, m7, m0   ; out16 out17
+    paddsw m7, m0       ; out15 out14
+    mova m0, [rsp+gprsize+0*32]
+    punpcklqdq m12, m13, m15 ; t19a t18
+    punpckhqdq m13, m15      ; t28a t29
+    psubsw m15, m0, m1  ; out31 out30
+    paddsw m0, m1       ; out0  out1
+    mova m1, [rsp+gprsize+1*32]
+    mova [rsp+gprsize+0*32], m6
+    mova m6, [rsp+gprsize+2*32]
+    psubsw m14, m1, m13 ; out28 out29
+    paddsw m1, m13      ; out3  out2
+    psubsw m13, m2, m11 ; out27 out26
+    paddsw m2, m11      ; out4  out5
+    psubsw m11, m4, m9  ; out23 out22
+    paddsw m4, m9       ; out8  out9
+    psubsw m9, m6, m12  ; out19 out18
+    paddsw m6, m12      ; out12 out13
+    ret
+
+%macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2]
+    vbroadcasti128 m%1, [cq+16*%3]
+    vbroadcasti128 m%2, [cq+16*%4]
+    shufpd m%1, m%2, 0x0c
+%endmacro
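+
+; LOAD_PACKED_16X2 broadcasts two 16-byte coefficient rows and merges
+; them with shufpd so one ymm register holds a packed row pair, matching
+; the packed layout used by the 8x32 odd-half code above.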
+
+cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob
+    lea r6, [o_base]
+    test eobd, eobd
+    jnz .normal
+    movd xm1, [o(pw_2896x8)]
+    pmulhrsw xm0, xm1, [cq]
+    movd xm2, [o(pw_8192)]
+    mov [cq], eobd
+    or r3d, 8
+.dconly:
+    pmulhrsw xm0, xm2
+    movd xm2, [pw_2048] ; intentionally rip-relative
+    pmulhrsw xm0, xm1
+    pmulhrsw xm0, xm2
+    vpbroadcastw m0, xm0
+    pxor m3, m3
+.dconly_loop:
+    mova m1, [dstq]
+    punpckhbw m2, m1, m3
+    punpcklbw m1, m3
+    paddw m2, m0
+    paddw m1, m0
+    packuswb m1, m2
+    mova [dstq], m1
+    add dstq, strideq
+    dec r3d
+    jg .dconly_loop
+    RET
+.normal:
+    PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob
+    %undef cmp
+    LOAD_PACKED_16X2 0, 7, 0, 2   ; in0  in2
+    LOAD_PACKED_16X2 4, 7, 1, 3   ; in1  in3
+    LOAD_PACKED_16X2 1, 7, 4, 6   ; in4  in6
+    LOAD_PACKED_16X2 5, 7, 5, 7   ; in5  in7
+    pxor m8, m8
+    REPX {mova [cq+32*x], m8}, 0, 1, 2, 3
+    add cq, 16*16
+    LOAD_PACKED_16X2 2, 7, -8, -6 ; in8  in10
+    LOAD_PACKED_16X2 6, 7, -7, -5 ; in9  in11
+    LOAD_PACKED_16X2 3, 7, -4, -2 ; in12 in14
+    LOAD_PACKED_16X2 11, 7, -3, -1 ; in13 in15
+    REPX {mova [cq+32*x], m8}, -4, -3, -2, -1
+    mova [rsp+32*0], m4
+    mova [rsp+32*1], m5
+    mova [rsp+32*2], m6
+    cmp eobd, 106
+    jg .full
+    pxor m4, m4
+    REPX {mova x, m4}, m5, m6, m7
+    call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
+    jmp .pass2
+.full:
+    LOAD_PACKED_16X2 4, 7, 0, 2   ; in16 in18
+    LOAD_PACKED_16X2 12, 7, 3, 1  ; in19 in17
+    LOAD_PACKED_16X2 5, 7, 4, 6   ; in20 in22
+    LOAD_PACKED_16X2 13, 7, 7, 5  ; in23 in21
+    REPX {mova [cq+32*x], m8}, 0, 1, 2, 3
+    add cq, 16*8
+    LOAD_PACKED_16X2 6, 7, 0, 2   ; in24 in26
+    LOAD_PACKED_16X2 14, 7, 3, 1  ; in27 in25
+    LOAD_PACKED_16X2 7, 8, 4, 6   ; in28 in30
+    LOAD_PACKED_16X2 15, 8, 7, 5  ; in31 in29
+    pxor m8, m8
+    REPX {mova [cq+32*x], m8}, 0, 1, 2, 3
+    call m(inv_txfm_add_dct_dct_8x32_8bpc).main
+.pass2:
+    vpbroadcastd m12, [o(pw_8192)]
+    REPX {pmulhrsw x, m12}, m8, m9, m10, m11, m13, m14, m15
+    mova [rsp+32*1], m9
+    mova [rsp+32*2], m10
+    punpckhwd m9, m0, m2
+    punpcklwd m0, m2
+    punpckhwd m2, m1, m3
+    punpcklwd m1, m3
+    punpcklwd m10, m4, m6
+    punpckhwd m4, m6
+    punpcklwd m6, m5, m7
+    punpckhwd m5, m7
+    punpckhwd m3, m0, m9
+    punpcklwd m0, m9
+    punpckhwd m9, m2, m1
+    punpcklwd m2, m1
+    punpcklwd m7, m10, m4
+    punpckhwd m10, m4
+    punpcklwd m4, m5, m6
+    punpckhwd m5, m6
+    punpckhdq m1, m0, m2
+    punpckldq m0, m2
+    punpckldq m2, m3, m9
+    punpckhdq m3, m9
+    punpckldq m6, m7, m4
+    punpckhdq m7, m4
+    punpckldq m9, m10, m5
+    punpckhdq m10, m5
+    REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m6, m7, m9, m10
+    pmulhrsw m12, [rsp+32*0]
+    mova [rsp+32*0], m8
+    vperm2i128 m4, m0, m6, 0x31
+    vinserti128 m0, xm6, 1
+    vperm2i128 m5, m1, m7, 0x31
+    vinserti128 m1, xm7, 1
+    vperm2i128 m6, m2, m9, 0x31
+    vinserti128 m2, xm9, 1
+    vperm2i128 m7, m3, m10, 0x31
+    vinserti128 m3, xm10, 1
+    call m(idct_16x8_internal_8bpc).main
+    vpbroadcastd m8, [o(pw_2048)]
+    REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+    lea r2, [strideq*3]
+    WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
+    WRITE_16X2 2, 3, 0, 1, strideq*2, r2
+    lea r3, [dstq+strideq*4]
+    %define dstq r3
+    WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+    WRITE_16X2 6, 7, 0, 1, strideq*2, r2
+    mova m0, [rsp+32*0]
+    mova m1, [rsp+32*1]
+    mova m2, [rsp+32*2]
+    punpckhwd m7, m0, m2
+    punpcklwd m0, m2
+    punpckhwd m2, m1, m11
+    punpcklwd m1, m11
+    punpckhwd m4, m12, m14
+    punpcklwd m12, m14
+    punpckhwd m5, m13, m15
+    punpcklwd m13, m15
+    punpckhwd m3, m0, m7
+    punpcklwd m0, m7
+    punpckhwd m9, m2, m1
+    punpcklwd m2, m1
+    punpcklwd m7, m12, m4
+    punpckhwd m12, m4
+    punpcklwd m4, m5, m13
+    punpckhwd m5, m13
+    punpckhdq m1, m0, m2
+    punpckldq m0, m2
+    punpckldq m2, m3, m9
+    punpckhdq m3, m9
+    punpckldq m6, m7, m4
+    punpckhdq m7, m4
+    punpckldq m9, m12, m5
+    punpckhdq m12, m5
+    vperm2i128 m4, m0, m6, 0x31
+    vinserti128 m0, xm6, 1
+    vperm2i128 m5, m1, m7, 0x31
+    vinserti128 m1, xm7, 1
+    vperm2i128 m6, m2, m9, 0x31
+    vinserti128 m2, xm9, 1
+    vperm2i128 m7, m3, m12, 0x31
+    vinserti128 m3, xm12, 1
+    call m(idct_16x8_internal_8bpc).main2
+    vpbroadcastd m8, [o(pw_2048)]
+    REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+    add r0, 16
+    add r3, 16
+    %define dstq r0
+    WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
+    WRITE_16X2 2, 3, 0, 1, strideq*2, r2
+    %define dstq r3
+    WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+    WRITE_16X2 6, 7, 0, 1, strideq*2, r2
+    RET
+
+cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 5, 11, dst, stride, c, eob
+    vpbroadcastd m9, [pw_5]
+    lea r4, [strideq*3]
+    sub eobd, 107 ; loop_iterations = 1 + (eobd >= 107)
+.loop:
+    mova xm0, [cq+16* 0]
+    mova xm1, [cq+16* 4]
+    vinserti128 m0, [cq+16* 1], 1
+    vinserti128 m1, [cq+16* 5], 1
+    pxor m8, m8
+    mova [cq+32*0], m8
+    mova [cq+32*2], m8
+    add cq, 16*16
+    mova xm2, [cq-16* 8]
+    mova xm3, [cq-16* 4]
+    vinserti128 m2, [cq-16* 7], 1
+    vinserti128 m3, [cq-16* 3], 1
+    mova xm4, [cq+16* 0]
+    mova xm5, [cq+16* 4]
+    vinserti128 m4, [cq+16* 1], 1
+    vinserti128 m5, [cq+16* 5], 1
+    mova xm6, [cq+16* 8]
+    mova xm7, [cq+16*12]
+    vinserti128 m6, [cq+16* 9], 1
+    vinserti128 m7, [cq+16*13], 1
+    REPX {mova [cq+32*x], m8}, -4, -2, 0, 2, 4, 6
+    REPX {paddsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+    call .transpose8x8
+    REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
+    WRITE_8X4 0, 4, 8, 10, strideq*8, strideq*4, r4*4
+    add dstq, strideq
+    WRITE_8X4 1, 5, 0, 4, strideq*8, strideq*4, r4*4
+    add dstq, strideq
+    WRITE_8X4 2, 6, 0, 4, strideq*8, strideq*4, r4*4
+    add dstq, strideq
+    WRITE_8X4 3, 7, 0, 4, strideq*8, strideq*4, r4*4
+    add dstq, strideq
+    sub cq, 16*16-32
+    lea dstq, [dstq+r4*4]
+    add eobd, 0x80000000
+    jnc .loop
+    RET
+ALIGN function_align
+.transpose8x8:
+    punpckhwd m8, m4, m5
+    punpcklwd m4, m5
+    punpckhwd m5, m0, m1
+    punpcklwd m0, m1
+    punpckhwd m1, m6, m7
+    punpcklwd m6, m7
+    punpckhwd m7, m2, m3
+    punpcklwd m2, m3
+    punpckhdq m3, m0, m2
+    punpckldq m0, m2
+    punpckldq m2, m4, m6
+    punpckhdq m4, m6
+    punpckhdq m6, m5, m7
+    punpckldq m5, m7
+    punpckldq m7, m8, m1
+    punpckhdq m8, m1
+    punpckhqdq m1, m0, m2
+    punpcklqdq m0, m2
+    punpcklqdq m2, m3, m4
+    punpckhqdq m3, m4
+    punpcklqdq m4, m5, m7
+    punpckhqdq m5, m7
+    punpckhqdq m7, m6, m8
+    punpcklqdq m6, m8
+    ret
+
+cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 10, dst, stride, c, eob
+    add cq, 16*8
+    vpbroadcastd m9, [pw_4096]
+    lea r4, [strideq*3]
+    lea r5, [dstq+strideq*4]
+    sub eobd, 107
+.loop:
+    mova xm0, [cq-16*8]
+    mova xm1, [cq-16*7]
+    vinserti128 m0, [cq+16*0], 1
+    vinserti128 m1, [cq+16*1], 1
+    mova xm2, [cq-16*6]
+    mova xm3, [cq-16*5]
+    vinserti128 m2, [cq+16*2], 1
+    vinserti128 m3, [cq+16*3], 1
+    mova xm4, [cq-16*4]
+    mova xm5, [cq-16*3]
+    vinserti128 m4, [cq+16*4], 1
+    vinserti128 m5, [cq+16*5], 1
+    mova xm6, [cq-16*2]
+    mova xm7, [cq-16*1]
+    vinserti128 m6, [cq+16*6], 1
+    vinserti128 m7, [cq+16*7], 1
+    pxor m8, m8
+    REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3
+    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+    REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+    WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
+    WRITE_16X2 2, 3, 0, 1, strideq*2, r4
+    %define dstq r5
+    WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1
+    WRITE_16X2 6, 7, 0, 1, strideq*2, r4
+    add cq, 16*16
+    add r0, 16
+    add r5, 16
+    add eobd, 0x80000000
+    jnc .loop
+    RET
+
+%define o_base pw_5 + 128
+
+%macro LOAD_16ROWS 2-4 0, 1 ; src, stride, is_rect2, zero_coefs
+%if %3
+    vpbroadcastd m15, [o(pw_2896x8)]
+    pmulhrsw m0, m15, [%1+%2* 0]
+    pmulhrsw m1, m15, [%1+%2* 1]
+    pmulhrsw m2, m15, [%1+%2* 2]
+    pmulhrsw m3, m15, [%1+%2* 3]
+    pmulhrsw m4, m15, [%1+%2* 4]
+    pmulhrsw m5, m15, [%1+%2* 5]
+    pmulhrsw m6, m15, [%1+%2* 6]
+    pmulhrsw m7, m15, [%1+%2* 7]
+    pmulhrsw m8, m15, [%1+%2* 8]
+    pmulhrsw m9, m15, [%1+%2* 9]
+    pmulhrsw m10, m15, [%1+%2*10]
+    pmulhrsw m11, m15, [%1+%2*11]
+    pmulhrsw m12, m15, [%1+%2*12]
+    pmulhrsw m13, m15, [%1+%2*13]
+    pmulhrsw m14, m15, [%1+%2*14]
+    pmulhrsw m15, [%1+%2*15]
+%else
+    mova m0, [%1+%2* 0]
+    mova m1, [%1+%2* 1]
+    mova m2, [%1+%2* 2]
+    mova m3, [%1+%2* 3]
+    mova m4, [%1+%2* 4]
+    mova m5, [%1+%2* 5]
+    mova m6, [%1+%2* 6]
+    mova m7, [%1+%2* 7]
+    mova m8, [%1+%2* 8]
+    mova m9, [%1+%2* 9]
+    mova m10, [%1+%2*10]
+    mova m11, [%1+%2*11]
+    mova m12, [%1+%2*12]
+    mova m13, [%1+%2*13]
+    mova m14, [%1+%2*14]
+    mova m15, [%1+%2*15]
+%endif
+    mova [rsp], m15
+%if %4
+    pxor m15, m15
+    REPX {mova [%1+%2*x], m15}, 0, 1, 2, 3, 4, 5, 6, 7, \
+                                8, 9, 10, 11, 12, 13, 14, 15
+%endif
+%endmacro
+
+%macro IDCT32_PASS2_END 7 ; coefs[1-2], tmp[1-2], rnd, offset[1-2]
+    mova m%4, [%2]
+    paddsw m%3, m%1, m%4
+    psubsw m%1, m%4
+    pmovzxbw m%4, [dstq+%6]
+    pmulhrsw m%3, m%5
+    pmulhrsw m%1, m%5
+    paddw m%3, m%4
+    pmovzxbw m%4, [r2+%7]
+    paddw m%1, m%4
+    packuswb m%3, m%1
+    vpermq m%3, m%3, q3120
+    mova [dstq+%6], xm%3
+    vextracti128 [r2+%7], m%3, 1
+%endmacro
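+
+; IDCT32_PASS2_END forms one sumsub pair of the final 32-point butterfly
+; (rows n and 31-n), rounds both, and adds them to their destination rows
+; in a single pass; the second row is addressed from the end via r2,
+; which counts down while dstq counts up.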
m8, 1 + mova [r2-32*3+16], xm10 + vextracti128 [r2+32*1+16], m10, 1 + mova [r2-32*2+16], xm12 + vextracti128 [r2+32*2+16], m12, 1 + mova [r2-32*1+16], xm14 + vextracti128 [r2+32*3+16], m14, 1 + vinserti128 m8, m1, xm9, 1 + vperm2i128 m12, m1, m9, 0x31 + mova xm0, [tmp1q-32*4] + mova xm1, [tmp1q-32*3] + vinserti128 m0, [tmp1q+32*0], 1 + vinserti128 m1, [tmp1q+32*1], 1 + vinserti128 m10, m5, xm13, 1 + vperm2i128 m14, m5, m13, 0x31 + mova xm4, [tmp1q-32*4+16] + mova xm5, [tmp1q-32*3+16] + vinserti128 m4, [tmp1q+32*0+16], 1 + vinserti128 m5, [tmp1q+32*1+16], 1 + vinserti128 m9, m3, xm11, 1 + vperm2i128 m13, m3, m11, 0x31 + mova xm2, [tmp1q-32*2] + mova xm3, [tmp1q-32*1] + vinserti128 m2, [tmp1q+32*2], 1 + vinserti128 m3, [tmp1q+32*3], 1 + vinserti128 m11, m7, xm15, 1 + vperm2i128 m15, m7, m15, 0x31 + mova xm6, [tmp1q-32*2+16] + mova xm7, [tmp1q-32*1+16] + vinserti128 m6, [tmp1q+32*2+16], 1 + vinserti128 m7, [tmp1q+32*3+16], 1 + call .main_oddhalf + LOAD_8ROWS_H r2-32*4, 32 +.idct16: + LOAD_8ROWS tmp3q-32*4, 32 + mova [rsp], m15 + call m(idct_16x16_internal_8bpc).main + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq + call .pass2_end + RET +ALIGN function_align +cglobal_label .main_oddhalf_fast ; lower half is zero + mova [rsp+gprsize+32*1], m7 + pxor m7, m7 + mova [rsp+gprsize+32*0], m7 + mova [rsp+gprsize+32*2], m7 + vpbroadcastd m11, [o(pw_3703x8)] + vpbroadcastd m7, [o(pw_1751x8)] + vpbroadcastd m12, [o(pw_m1380x8)] + vpbroadcastd m8, [o(pw_3857x8)] + vpbroadcastd m13, [o(pw_3973x8)] + vpbroadcastd m15, [o(pw_995x8)] + pmulhrsw m11, m4 ; t29a + pmulhrsw m4, m7 ; t18a + pmulhrsw m12, m3 ; t19a + pmulhrsw m3, m8 ; t28a + pmulhrsw m13, m2 ; t27a + pmulhrsw m2, m15 ; t20a + vpbroadcastd m10, [o(pw_m2106x8)] + vpbroadcastd m7, [o(pw_3513x8)] + vpbroadcastd m9, [o(pw_3290x8)] + vpbroadcastd m8, [o(pw_2440x8)] + vpbroadcastd m14, [o(pw_m601x8)] + vpbroadcastd m15, [o(pw_4052x8)] + pmulhrsw m10, m5 ; t21a + pmulhrsw m5, m7 ; t26a + pmulhrsw m9, m6 ; t25a + pmulhrsw m6, m8 ; t22a + pmulhrsw m14, m1 ; t23a + pmulhrsw m1, m15 ; t24a + vpbroadcastd m15, [o(pd_2048)] + jmp .main2 +ALIGN function_align +cglobal_label .main_oddhalf + mova [rsp+gprsize+32*0], m15 + mova [rsp+gprsize+32*1], m7 + mova [rsp+gprsize+32*2], m8 + vpbroadcastd m15, [o(pd_2048)] + ITX_MULSUB_2W 4, 11, 7, 8, 15, 1751, 3703 ; t18a, t29a + ITX_MULSUB_2W 12, 3, 7, 8, 15, 3857, 1380 ; t19a, t28a + ITX_MULSUB_2W 2, 13, 7, 8, 15, 995, 3973 ; t20a, t27a + ITX_MULSUB_2W 10, 5, 7, 8, 15, 3513, 2106 ; t21a, t26a + ITX_MULSUB_2W 6, 9, 7, 8, 15, 2440, 3290 ; t22a, t25a + ITX_MULSUB_2W 14, 1, 7, 8, 15, 4052, 601 ; t23a, t24a +.main2: + psubsw m7, m12, m4 ; t18 + paddsw m12, m4 ; t19 + psubsw m4, m2, m10 ; t21 + paddsw m2, m10 ; t20 + psubsw m10, m14, m6 ; t22 + paddsw m14, m6 ; t23 + psubsw m6, m1, m9 ; t25 + paddsw m1, m9 ; t24 + psubsw m9, m13, m5 ; t26 + paddsw m13, m5 ; t27 + psubsw m5, m3, m11 ; t29 + paddsw m3, m11 ; t28 + ITX_MULSUB_2W 5, 7, 8, 11, 15, m4017, 799 ; t18a, t29a + ITX_MULSUB_2W 9, 4, 8, 11, 15, 3406, 2276 ; t21a, t26a + ITX_MULSUB_2W 6, 10, 8, 11, 15, m2276, 3406 ; t22a, t25a + psubsw m8, m14, m2 ; t20a + paddsw m14, m2 ; t23a + psubsw m2, m1, m13 ; t27a + paddsw m1, m13 ; t24a + psubsw m13, m6, m9 ; t21 + paddsw m6, m9 ; t22 + psubsw m9, m10, m4 ; t26 + paddsw m10, m4 ; t25 + ITX_MULSUB_2W 2, 8, 4, 11, 15, m3784, 1567 ; t20, t27 + ITX_MULSUB_2W 9, 13, 4, 11, 15, m3784, 1567 ; t21a, t26a + mova m4, [rsp+gprsize+32*0] ; in31 + mova [rsp+gprsize+32*0], m6 ; t22 + mova m6, [rsp+gprsize+32*1] ; in15 + mova 
[rsp+gprsize+32*1], m14 ; t23a + mova m14, [rsp+gprsize+32*2] ; in17 + mova [rsp+gprsize+32*2], m1 ; t24a + ITX_MULSUB_2W 0, 4, 1, 11, 15, 201, 4091 ; t16a, t31a + ITX_MULSUB_2W 14, 6, 1, 11, 15, 3035, 2751 ; t17a, t30a + psubsw m1, m0, m14 ; t17 + paddsw m0, m14 ; t16 + psubsw m14, m4, m6 ; t30 + paddsw m4, m6 ; t31 + ITX_MULSUB_2W 14, 1, 6, 11, 15, 799, 4017 ; t17a, t30a + psubsw m6, m0, m12 ; t19a + paddsw m0, m12 ; t16a + psubsw m12, m4, m3 ; t28a + paddsw m4, m3 ; t31a + psubsw m3, m14, m5 ; t18 + paddsw m14, m5 ; t17 + psubsw m5, m1, m7 ; t29 + paddsw m1, m7 ; t30 + ITX_MULSUB_2W 5, 3, 7, 11, 15, 1567, 3784 ; t18a, t29a + ITX_MULSUB_2W 12, 6, 7, 11, 15, 1567, 3784 ; t19, t28 + psubsw m7, m1, m10 ; t25a + paddsw m1, m10 ; t30a + psubsw m10, m5, m9 ; t21 + paddsw m5, m9 ; t18 + psubsw m9, m12, m2 ; t20a + paddsw m12, m2 ; t19a + psubsw m2, m3, m13 ; t26 + paddsw m3, m13 ; t29 + psubsw m13, m6, m8 ; t27a + paddsw m6, m8 ; t28a + mova [tmp1q-32*2], m5 + mova [tmp1q-32*1], m12 + mova [tmp2q+32*0], m6 + mova [tmp2q+32*1], m3 + mova [tmp2q+32*2], m1 + mova m5, [rsp+gprsize+32*0] ; t22 + mova m6, [rsp+gprsize+32*1] ; t23 + mova m3, [rsp+gprsize+32*2] ; t24a + psubsw m1, m14, m5 ; t22a + paddsw m14, m5 ; t17a + psubsw m5, m0, m6 ; t23 + paddsw m0, m6 ; t16 + psubsw m6, m4, m3 ; t24 + paddsw m4, m3 ; t31 + vpbroadcastd m8, [o(pw_m2896_2896)] + vpbroadcastd m3, [o(pw_2896_2896)] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m14 + mova [tmp2q+32*3], m4 + ITX_MULSUB_2W 13, 9, 0, 4, 15, 3, 8 ; t20, t27 + ITX_MULSUB_2W 2, 10, 0, 4, 15, 3, 8 ; t21a, t26a + ITX_MULSUB_2W 7, 1, 0, 4, 15, 3, 8 ; t22, t25 + ITX_MULSUB_2W 6, 5, 0, 4, 15, 3, 8 ; t23a, t24a + mova [tmp1q+32*0], m13 + mova [tmp1q+32*1], m2 + mova [tmp1q+32*2], m7 + mova [tmp1q+32*3], m6 + mova [tmp2q-32*4], m5 + mova [tmp2q-32*3], m1 + mova [tmp2q-32*2], m10 + mova [tmp2q-32*1], m9 + ret +ALIGN function_align +.transpose_2x8x8_round: + punpckhwd m6, m12, m13 + punpcklwd m12, m13 + punpckhwd m13, m8, m9 + punpcklwd m8, m9 + punpckhwd m9, m14, m15 + punpcklwd m14, m15 + punpckhwd m15, m10, m11 + punpcklwd m10, m11 + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5 + punpckhdq m11, m8, m10 + punpckldq m8, m10 + punpckldq m10, m12, m14 + punpckhdq m12, m14 + punpckhdq m14, m13, m15 + punpckldq m13, m15 + punpckldq m15, m6, m9 + punpckhdq m6, m9 + punpckhqdq m9, m8, m10 + punpcklqdq m8, m10 + punpcklqdq m10, m11, m12 + punpckhqdq m11, m12 + punpcklqdq m12, m13, m15 + punpckhqdq m13, m15 + punpckhqdq m15, m14, m6 + punpcklqdq m14, m6 + pmulhrsw m6, m7, [rsp+gprsize+32*0] + REPX {pmulhrsw x, m7}, m8, m9, m10, m11, m12, m13, m14, m15 + pmulhrsw m7, [rsp+gprsize+32*1] + mova [rsp+gprsize+32*0], m15 + punpckhwd m15, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m6, m7 + punpcklwd m6, m7 + punpckhwd m7, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + punpckhdq m6, m5, m7 + punpckldq m5, m7 + punpckldq m7, m15, m1 + punpckhdq m15, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m7 + punpckhqdq m5, m7 + punpckhqdq m7, m6, m15 + punpcklqdq m6, m15 + ret +ALIGN function_align +.pass2_end: + mova [rsp+gprsize+32*0], m7 + mova [rsp+gprsize+32*2], m15 + vpbroadcastd m15, [o(pw_2048)] + IDCT32_PASS2_END 0, tmp2q+32*3, 1, 7, 15, strideq*0, r3*4 + IDCT32_PASS2_END 4, tmp2q-32*1, 0, 7, 15, strideq*4, strideq*8 + IDCT32_PASS2_END 8, tmp1q+32*3, 0, 4, 15, strideq*8, strideq*4 + IDCT32_PASS2_END 12, 
+    add                dstq, strideq
+    sub                  r2, strideq
+    mova                 m1, [rsp+gprsize+32*1]
+    IDCT32_PASS2_END  1, tmp2q+32*2, 0, 4, 15, strideq*0, r3*4
+    IDCT32_PASS2_END  5, tmp2q-32*2, 0, 4, 15, strideq*4, strideq*8
+    IDCT32_PASS2_END  9, tmp1q+32*2, 0, 4, 15, strideq*8, strideq*4
+    IDCT32_PASS2_END 13, tmp1q-32*2, 0, 4, 15, r3*4,      strideq*0
+    add                dstq, strideq
+    sub                  r2, strideq
+    IDCT32_PASS2_END  2, tmp2q+32*1, 0, 4, 15, strideq*0, r3*4
+    IDCT32_PASS2_END  6, tmp2q-32*3, 0, 4, 15, strideq*4, strideq*8
+    IDCT32_PASS2_END 10, tmp1q+32*1, 0, 4, 15, strideq*8, strideq*4
+    IDCT32_PASS2_END 14, tmp1q-32*3, 0, 4, 15, r3*4,      strideq*0
+    add                dstq, strideq
+    sub                  r2, strideq
+    mova                 m7, [rsp+gprsize+32*0]
+    mova                 m1, [rsp+gprsize+32*2]
+    IDCT32_PASS2_END  3, tmp2q+32*0, 0, 4, 15, strideq*0, r3*4
+    IDCT32_PASS2_END  7, tmp2q-32*4, 0, 4, 15, strideq*4, strideq*8
+    IDCT32_PASS2_END 11, tmp1q+32*0, 0, 4, 15, strideq*8, strideq*4
+    IDCT32_PASS2_END  1, tmp1q-32*4, 0, 4, 15, r3*4,      strideq*0
+    ret
+
+; Perform the final sumsub step and YMM lane shuffling
+%macro IDCT32_PASS1_END 4 ; row[1-2], tmp[1-2]
+    mova                m%3, [tmp2q+32*( 3-%1)]
+    psubsw              m%4, m%1, m%3
+    paddsw              m%1, m%3
+    mova                m%3, [tmp1q+32*(11-%2)]
+    mova [tmp1q+32*(11-%2)+16], xm%4
+    vextracti128 [tmp2q+32*( 3-%1)+16], m%4, 1
+    paddsw              m%4, m%2, m%3
+    psubsw              m%2, m%3
+    mova   [tmp1q+32*(11-%2)], xm%2
+    vextracti128 [tmp2q+32*( 3-%1)], m%2, 1
+    vperm2i128          m%2, m%1, m%4, 0x31
+    vinserti128         m%1, xm%4, 1
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 4, 0, dst, stride, c, eob
+    lea                  r6, [o_base]
+    test               eobd, eobd
+    jnz .normal
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_16384)]
+    mov                [cq], eobd
+    pmulhrsw            xm0, xm1
+    or                  r3d, 16
+    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
+.normal:
+    PROLOGUE 0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2
+    vpbroadcastd        m15, [o(pw_2896x8)]
+    pmulhrsw             m0, m15, [cq+32* 1]
+    pmulhrsw             m1, m15, [cq+32* 3]
+    pmulhrsw             m2, m15, [cq+32* 5]
+    pmulhrsw             m3, m15, [cq+32* 7]
+    pmulhrsw             m4, m15, [cq+32* 9]
+    pmulhrsw             m5, m15, [cq+32*11]
+    pmulhrsw             m6, m15, [cq+32*13]
+    pmulhrsw             m7, m15, [cq+32*15]
+    pmulhrsw             m8, m15, [cq+32*17]
+    pmulhrsw             m9, m15, [cq+32*19]
+    pmulhrsw            m10, m15, [cq+32*21]
+    pmulhrsw            m11, m15, [cq+32*23]
+    pmulhrsw            m12, m15, [cq+32*25]
+    pmulhrsw            m13, m15, [cq+32*27]
+    pmulhrsw            m14, m15, [cq+32*29]
+    pmulhrsw            m15, [cq+32*31]
+    lea               tmp1q, [rsp+32*7]
+    lea               tmp2q, [tmp1q+32*8]
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+    LOAD_16ROWS cq+32*0, 32*2, 1, 0
+    pxor                m15, m15
+    mov                 r3d, 8
+.zero_loop:
+    mova          [cq+32*0], m15
+    mova          [cq+32*1], m15
+    mova          [cq+32*2], m15
+    mova          [cq+32*3], m15
+    add                  cq, 32*4
+    dec                 r3d
+    jg .zero_loop
+    call m(idct_16x16_internal_8bpc).main
+    call .pass1_end
+    lea                  r2, [strideq*3]
+    mov                  r3, dstq
+.pass2:
+    vpbroadcastd         m7, [o(pw_16384)]
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
+    call m(idct_16x16_internal_8bpc).main
+    mova         [rsp+32*2], m15
+    vpbroadcastd        m15, [o(pw_2048)]
+    REPX {pmulhrsw x, m15}, m2, m3, m0
+    WRITE_16X2            2, 3, 1, 2, strideq*2, r2
+    pmulhrsw             m1, m15, [rsp+32*1]
+    WRITE_16X2            0, 1, 2, 3, strideq*0, strideq*1
+    lea                dstq, [dstq+strideq*4]
+    REPX {pmulhrsw x, m15}, m4, m5, m6, m7
+    WRITE_16X2            4, 5, 2, 3, strideq*0, strideq*1
+    WRITE_16X2            6, 7, 2, 3, strideq*2, r2
+    lea                dstq, [dstq+strideq*4]
+    REPX {pmulhrsw x, m15}, m8, m9, m10, m11
+    WRITE_16X2            8, 9, 2, 3, strideq*0, strideq*1
+    WRITE_16X2           10, 11, 2, 3, strideq*2, r2
+    lea                dstq, [dstq+strideq*4]
+    REPX {pmulhrsw x, m15}, m11, m12, m13, m14
+    pmulhrsw            m15, [rsp+32*2]
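+    ; (Reviewer note, not from the upstream dav1d source:) pmulhrsw computes
+    ; (a*b*2 + 0x8000) >> 16, so the pw_2048 multiplies above are a rounded
+    ; shift, out = (coef + 8) >> 4: the final pass-2 downscale before the
+    ; saturated add to the destination pixels in WRITE_16X2.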
+    WRITE_16X2           12, 13, 2, 3, strideq*0, strideq*1
+    WRITE_16X2           14, 15, 2, 3, strideq*2, r2
+    test                 r3, r3
+    jnz .right_half
+    RET
+.right_half:
+    LOAD_8ROWS   tmp1q-32*4, 32
+    LOAD_8ROWS_H tmp2q-32*4, 32
+    lea                dstq, [r3+16]
+    xor                 r3d, r3d
+    mova         [rsp+32*0], m6
+    mova         [rsp+32*1], m7
+    jmp .pass2
+ALIGN function_align
+.pass1_end:
+    mova [rsp+gprsize+32*0], m9
+    IDCT32_PASS1_END      0,  8, 1, 9
+    IDCT32_PASS1_END      2, 10, 1, 9
+    IDCT32_PASS1_END      3, 11, 1, 9
+    IDCT32_PASS1_END      4, 12, 1, 9
+    IDCT32_PASS1_END      5, 13, 1, 9
+    IDCT32_PASS1_END      6, 14, 1, 9
+    IDCT32_PASS1_END      7, 15, 1, 9
+    mova                 m1, [rsp+gprsize+32*1]
+    mova                 m9, [rsp+gprsize+32*0]
+    mova [rsp+gprsize+32*0], m6
+    mova [rsp+gprsize+32*1], m7
+    IDCT32_PASS1_END      1, 9, 6, 7
+    ret
+
+cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 5, 13, dst, stride, c, eob
+%undef cmp
+    lea                  r6, [o_base]
+    vpbroadcastd         m9, [o(pw_2896x8)]
+    vpbroadcastd        m10, [o(pw_1697x16)]
+    vpbroadcastd        m12, [o(pw_8192)]
+    cmp                eobd, 43   ; if (eob > 43)
+    setg                r4b      ;     iteration_count++
+    cmp                eobd, 150  ; if (eob > 150)
+    setg                 al      ;     iteration_count++
+    add                eobd, -279 ; if (eob > 278)
+    adc                 r4b, al  ;     iteration_count++
+    lea                  r3, [strideq*3]
+    mov                  r6, cq
+    paddw               m11, m12, m12 ; pw_16384
+.loop:
+    mova                xm0, [cq+64* 0]
+    mova                xm1, [cq+64* 1]
+    vinserti128          m0, [cq+64* 8], 1
+    vinserti128          m1, [cq+64* 9], 1
+    mova                xm2, [cq+64* 2]
+    mova                xm3, [cq+64* 3]
+    vinserti128          m2, [cq+64*10], 1
+    vinserti128          m3, [cq+64*11], 1
+    mova                xm4, [cq+64* 4]
+    mova                xm5, [cq+64* 5]
+    vinserti128          m4, [cq+64*12], 1
+    vinserti128          m5, [cq+64*13], 1
+    mova                xm6, [cq+64* 6]
+    mova                xm7, [cq+64* 7]
+    vinserti128          m6, [cq+64*14], 1
+    vinserti128          m7, [cq+64*15], 1
+    REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {IDTX16 x, 8, 10, 11}, 0, 1, 2, 3, 4, 5, 6, 7
+    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+    REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+    WRITE_16X2            0, 1, 8, 0, strideq*0, strideq*1
+    WRITE_16X2            2, 3, 0, 1, strideq*2, r3
+    lea                dstq, [dstq+strideq*4]
+    WRITE_16X2            4, 5, 0, 1, strideq*0, strideq*1
+    WRITE_16X2            6, 7, 0, 1, strideq*2, r3
+    lea                dstq, [dstq+strideq*4]
+    add                  cq, 16
+    dec                 r4b
+    jge .loop
+    sub                  cq, 32
+    pxor                 m0, m0
+    mov                 r0d, 8
+    cmp                  cq, r6
+    ja .zero_loop
+.zero_loop_half:
+    mova          [r6+64*0], m0
+    mova          [r6+64*1], m0
+    add                  r6, 64*4
+    mova          [r6-64*2], m0
+    mova          [r6-64*1], m0
+    sub                 r0d, 2
+    jg .zero_loop_half
+    RET
+.zero_loop:
+    mova          [r6+32*0], m0
+    mova          [r6+32*1], m0
+    mova          [r6+32*2], m0
+    mova          [r6+32*3], m0
+    add                  r6, 32*4
+    dec                 r0d
+    jg .zero_loop
+    RET
+
+cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 12, dst, stride, c, eob
+%undef cmp
+    lea                  r6, [o_base]
+    vpbroadcastd         m9, [o(pw_2896x8)]
+    vpbroadcastd        m10, [o(pw_1697x16)]
+    vpbroadcastd        m11, [o(pw_2048)]
+    cmp                eobd, 35  ; if (eob > 35)
+    setg                r4b     ;     iteration_count++
+    cmp                eobd, 150 ; if (eob > 150)
+    setg                r3b     ;     iteration_count += 2
+    lea                 r4d, [r4+r3*2]
+    lea                  r3, [strideq*3]
+    mov                  r5, dstq
+    mov                  r6, cq
+.loop:
+    mova                xm0, [cq+32* 0]
+    mova                xm1, [cq+32* 1]
+    vinserti128          m0, [cq+32* 8], 1
+    vinserti128          m1, [cq+32* 9], 1
+    mova                xm2, [cq+32* 2]
+    mova                xm3, [cq+32* 3]
+    vinserti128          m2, [cq+32*10], 1
+    vinserti128          m3, [cq+32*11], 1
+    mova                xm4, [cq+32* 4]
+    mova                xm5, [cq+32* 5]
+    vinserti128          m4, [cq+32*12], 1
+    vinserti128          m5, [cq+32*13], 1
+    mova                xm6, [cq+32* 6]
+    mova                xm7, [cq+32* 7]
+    vinserti128          m6, [cq+32*14], 1
+    vinserti128          m7, [cq+32*15], 1
+    REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {paddsw   x, x  }, m0, m1, m2, m3, m4, m5, m6, m7
+    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+    REPX {IDTX16 x, 8, 10}, 0, 1, 2, 3, 4, 5, 6, 7
+    REPX {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+    WRITE_16X2            0, 1, 8, 0, strideq*0, strideq*1
+    WRITE_16X2            2, 3, 0, 1, strideq*2, r3
+    lea                dstq, [dstq+strideq*4]
+    WRITE_16X2            4, 5, 0, 1, strideq*0, strideq*1
+    WRITE_16X2            6, 7, 0, 1, strideq*2, r3
+    lea                dstq, [dstq+strideq*4]
+    add                  cq, 16
+    dec                 r4b
+    jl .ret
+    test                r4b, 1
+    jz .loop
+    add                  cq, 32*15
+    lea                dstq, [r5+16]
+    jmp .loop
+.ret:
+    sub                  cd, eax
+    pxor                 m0, m0
+    add                  cd, 384
+.zero_loop:
+    mova          [r6+32*0], m0
+    mova          [r6+32*1], m0
+    mova          [r6+32*2], m0
+    mova          [r6+32*3], m0
+    add                  r6, 32*4
+    sub                  cd, 128
+    jge .zero_loop
+    RET
+
+cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 4, 0, dst, stride, c, eob
+    lea                  r6, [o_base]
+    test               eobd, eobd
+    jnz .normal
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_8192)]
+    mov                [cq], eobd
+    or                  r3d, 32
+    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
+.normal:
+    PROLOGUE 0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \
+                              base, tmp3, tmp4
+    %undef cmp
+    lea               tmp1q, [rsp+32*7]
+    lea               tmp2q, [tmp1q+32*8]
+    sub                eobd, 136
+    mov               tmp4d, eobd
+.pass1_loop:
+    LOAD_8ROWS     cq+64*1, 64*2
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15
+    test              tmp4d, tmp4d
+    jl .fast
+    LOAD_8ROWS_H  cq+64*17, 64*2
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+    LOAD_8ROWS_H  cq+64*16, 64*2
+    pxor                 m0, m0
+    REPX {mova [cq+64*x], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \
+                               24, 25, 26, 27, 28, 29, 30, 31
+    mova              [rsp], m15
+    jmp .idct16
+.fast:
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+    pxor                 m8, m8
+    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
+    mova              [rsp], m8
+.idct16:
+    LOAD_8ROWS     cq+64*0, 64*2
+    pxor                m15, m15
+    REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14
+    call m(idct_16x16_internal_8bpc).main
+    call m(inv_txfm_add_dct_dct_32x16_8bpc).pass1_end
+    vpbroadcastd         m7, [o(pw_8192)]
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
+    lea               tmp3q, [tmp1q+32*32]
+    mova                m15, [rsp]
+    mova       [tmp3q-32*4], m0
+    mova       [tmp3q-32*3], m2
+    mova       [tmp3q-32*2], m4
+    mova       [tmp3q-32*1], m6
+    mova       [tmp3q+32*0], m8
+    mova       [tmp3q+32*1], m10
+    mova       [tmp3q+32*2], m12
+    mova       [tmp3q+32*3], m14
+    add               tmp3q, 32*8
+    mova       [tmp3q-32*4], m1
+    mova       [tmp3q-32*3], m3
+    mova       [tmp3q-32*2], m5
+    mova       [tmp3q-32*1], m7
+    mova       [tmp3q+32*0], m9
+    mova       [tmp3q+32*1], m11
+    mova       [tmp3q+32*2], m13
+    mova       [tmp3q+32*3], m15
+    vpbroadcastd         m9, [o(pw_8192)]
+    pmulhrsw             m0, m9, [tmp1q-32*4]
+    pmulhrsw             m1, m9, [tmp1q-32*3]
+    pmulhrsw             m2, m9, [tmp1q-32*2]
+    pmulhrsw             m3, m9, [tmp1q-32*1]
+    pmulhrsw             m4, m9, [tmp1q+32*0]
+    pmulhrsw             m5, m9, [tmp1q+32*1]
+    pmulhrsw             m6, m9, [tmp1q+32*2]
+    pmulhrsw             m7, m9, [tmp1q+32*3]
+    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+    mova       [tmp1q-32*4], m0
+    pmulhrsw             m0, m9, [tmp2q-32*4]
+    mova       [tmp2q-32*4], m1
+    pmulhrsw             m1, m9, [tmp2q-32*3]
+    mova       [tmp1q-32*3], m2
+    pmulhrsw             m2, m9, [tmp2q-32*2]
+    mova       [tmp2q-32*3], m3
+    pmulhrsw             m3, m9, [tmp2q-32*1]
+    mova       [tmp1q-32*2], m4
+    pmulhrsw             m4, m9, [tmp2q+32*0]
+    mova       [tmp2q-32*2], m5
+    pmulhrsw             m5, m9, [tmp2q+32*1]
+    mova       [tmp1q-32*1], m6
+    pmulhrsw             m6, m9, [tmp2q+32*2]
+    mova       [tmp2q-32*1], m7
+    pmulhrsw             m7, m9, [tmp2q+32*3]
+    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+    mova       [tmp1q+32*0], m0
+    mova       [tmp2q+32*0], m1
+    mova       [tmp1q+32*1], m2
+    mova       [tmp2q+32*1], m3
+    mova       [tmp1q+32*2], m4
+    mova       [tmp2q+32*2], m5
+    mova       [tmp1q+32*3], m6
+    mova       [tmp2q+32*3], m7
+    add                  cq, 32
+    add               tmp1q, 32*16
+    add               tmp2q, 32*16
+    add                eobd, 0x80000000
+    jnc .pass1_loop
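+    ; (Reviewer note, not from the upstream dav1d source:) eobd was biased by
+    ; "sub eobd, 136" above, so its sign bit records whether only the
+    ; top-left 16x16 quadrant is coded. The add/jnc pair uses that bit as a
+    ; loop counter: adding 0x80000000 sets the carry flag once the bit was
+    ; already set, so pass 1 runs a second time (for the right half of the
+    ; columns) only when eob >= 136.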
+    add               tmp1q, 32*24
+    imul                 r2, strideq, 19
+    lea                  r3, [strideq*3]
+    add                  r2, dstq
+    test              tmp4d, tmp4d
+    jge .pass2_loop
+    add               tmp1q, 32*16
+    add               tmp2q, 32*16
+    add               tmp3q, 32*16
+.pass2_loop:
+    LOAD_8ROWS   tmp2q-32*4, 32
+    test              tmp4d, tmp4d
+    jl .fast2
+    LOAD_8ROWS_H tmp3q-32*4, 32
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+    sub               tmp3q, 32*8
+    LOAD_8ROWS_H tmp3q-32*4, 32
+    sub               tmp3q, 32*16
+    jmp .pass2_loop_end
+.fast2:
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+    sub               tmp3q, 32*24
+    pxor                 m8, m8
+    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+.pass2_loop_end:
+    LOAD_8ROWS   tmp3q-32*4, 32
+    mova              [rsp], m15
+    call m(idct_16x16_internal_8bpc).main
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).pass2_end
+    lea               tmp3q, [tmp1q-32*32]
+    cmp               tmp2q, tmp3q
+    jb .ret
+    sub               tmp2q, 32*32
+    sub                dstq, r3
+    lea                  r2, [r2+r3+16]
+    add                dstq, 16
+    jmp .pass2_loop
+.ret:
+    RET
+
+cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 10, dst, stride, c, eob
+    %undef cmp
+    vpbroadcastd         m9, [pw_8192]
+    sub                eobd, 136 ; if (eob < 136)
+    shr                eobd, 30  ;     topleft 16x16 only
+    lea                eobd, [eobq*2-8]
+    lea                  r4, [strideq*3]
+    mov                  r5, dstq
+    lea                  r6, [cq+32]
+.loop:
+    mova                xm0, [cq+64* 0]
+    mova                xm1, [cq+64* 1]
+    vinserti128          m0, [cq+64* 8], 1
+    vinserti128          m1, [cq+64* 9], 1
+    mova                xm2, [cq+64* 2]
+    mova                xm3, [cq+64* 3]
+    vinserti128          m2, [cq+64*10], 1
+    vinserti128          m3, [cq+64*11], 1
+    mova                xm4, [cq+64* 4]
+    mova                xm5, [cq+64* 5]
+    vinserti128          m4, [cq+64*12], 1
+    vinserti128          m5, [cq+64*13], 1
+    mova                xm6, [cq+64* 6]
+    mova                xm7, [cq+64* 7]
+    vinserti128          m6, [cq+64*14], 1
+    vinserti128          m7, [cq+64*15], 1
+    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+    REPX  {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+    WRITE_16X2            0, 1, 8, 0, strideq*0, strideq*1
+    WRITE_16X2            2, 3, 0, 1, strideq*2, r4
+    lea                dstq, [dstq+strideq*4]
+    WRITE_16X2            4, 5, 0, 1, strideq*0, strideq*1
+    WRITE_16X2            6, 7, 0, 1, strideq*2, r4
+    lea                dstq, [dstq+strideq*4]
+    add                  cq, 16
+    inc                eobd
+    jz .ret
+    test               eobd, 3
+    jnz .loop
+    add                  cq, 64*15
+    lea                dstq, [r5+16]
+    jmp .loop
+.ret:
+    pxor                 m0, m0
+    mov                 r0d, 16
+    cmp                  cq, r6
+    jne .zero_loop
+.zero_loop_topleft:
+    mova          [r6-32*1], m0
+    mova          [r6+32*1], m0
+    mova          [r6+32*3], m0
+    mova          [r6+32*5], m0
+    add                  r6, 64*4
+    sub                 r0d, 4
+    jg .zero_loop_topleft
+    RET
+.zero_loop:
+    mova          [r6-32*1], m0
+    mova          [r6+32*0], m0
+    mova          [r6+32*1], m0
+    mova          [r6+32*2], m0
+    add                  r6, 32*4
+    dec                 r0d
+    jg .zero_loop
+    RET
+
+%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4])
+%if %1 & 1
+    mova                m%5, [tmp2q-32*(51-%1)] ; idct16 out 0+n
+    mova                m%4, [tmp1q-32*(14+%1)] ; idct32 out31-n
+%else
+    mova                m%5, [tmp1q-32*(45-%1)]
+    mova                m%4, [tmp2q-32*(20+%1)]
+%endif
+    psubsw              m%6, m%5, m%4 ; idct32 out31-n
+    paddsw              m%5, m%4      ; idct32 out 0+n
+    psubsw              m%4, m%6, m%3 ; out32+n
+    paddsw              m%6, m%3      ; out31-n
+    psubsw              m%3, m%5, m%2 ; out63-n
+    paddsw              m%5, m%2      ; out 0+n
+%if %0 == 6 ; pass 1
+%if %1 & 1
+    mova [tmp2q-32*(19-%1)], m%4
+    mova [tmp1q-32*(14+%1)], m%6
+    mova [tmp1q+32*(18-%1)], m%3
+    mova [tmp2q-32*(51-%1)], m%5
+%else
+    mova [tmp1q-32*(13-%1)], m%4
+    mova [tmp2q-32*(20+%1)], m%6
+    mova [tmp2q+32*(12-%1)], m%3
+    mova [tmp1q-32*(45-%1)], m%5
+%endif
+%else ; pass 2
+    REPX {pmulhrsw x, m14}, m%4, m%6, m%3, m%5
+%if %1 & 1
+    %define %%d0 r2
+    %define %%d1 dstq
+%else
+    %define %%d0 dstq
+    %define %%d1 r2
+%endif
+    pmovzxbw            m%2, [%%d0+%9 ]
+    paddw               m%2, m%4
+    pmovzxbw            m%4, [%%d1+%8 ]
+    paddw               m%4, m%6
+    pmovzxbw            m%6, [%%d1+%10]
+    paddw               m%3, m%6
+    pmovzxbw            m%6, [%%d0+%7 ]
+    paddw               m%5, m%6
+    packuswb            m%2, m%4
+    packuswb            m%3, m%5
+    vpermq              m%2, m%2, q3120
+    vpermq              m%3, m%3, q3120
+    mova         [%%d0+%9 ], xm%2
+    vextracti128 [%%d1+%8 ], m%2, 1
+    mova         [%%d1+%10], xm%3
+    vextracti128 [%%d0+%7 ], m%3, 1
+%endif
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 4, 0, dst, stride, c, eob
+    lea                  r6, [o_base]
+    test               eobd, eobd
+    jnz .normal
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_8192)]
+    mov                [cq], eobd
+    or                  r3d, 64
+    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+.normal:
+    PROLOGUE 0, 10, 16, 32*67, dst, stride, c, eob, tmp1, tmp2
+    %undef cmp
+    lea               tmp1q, [rsp+32*23]
+    lea               tmp2q, [tmp1q+32*24]
+    sub                eobd, 151
+    mov                 r7d, eobd
+.pass1_loop:
+    LOAD_16ROWS          cq, 64
+    call m(idct_16x16_internal_8bpc).main
+    mova                 m1, [rsp+32*1]
+    mova         [rsp+32*0], m6
+    mova         [rsp+32*1], m7
+    vpbroadcastd         m7, [o(pw_8192)]
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
+    mova                m15, [rsp+32*0]
+    mova       [tmp1q-32*4], m0
+    mova       [tmp1q-32*3], m2
+    mova       [tmp1q-32*2], m4
+    mova       [tmp1q-32*1], m6
+    mova       [tmp1q+32*0], m8
+    mova       [tmp1q+32*1], m10
+    mova       [tmp1q+32*2], m12
+    mova       [tmp1q+32*3], m14
+    mova       [tmp2q-32*4], m1
+    mova       [tmp2q-32*3], m3
+    mova       [tmp2q-32*2], m5
+    mova       [tmp2q-32*1], m7
+    mova       [tmp2q+32*0], m9
+    mova       [tmp2q+32*1], m11
+    mova       [tmp2q+32*2], m13
+    mova       [tmp2q+32*3], m15
+    add                  cq, 32
+    add               tmp1q, 32*8
+    add               tmp2q, 32*8
+    add                eobd, 0x80000000
+    jnc .pass1_loop
+    lea                  r2, [rsp+32*23]
+    mova                xm0, [r2-32*4+ 0]
+    mova                xm1, [r2-32*2+ 0]
+    vinserti128          m0, [r2+32*0+ 0], 1
+    vinserti128          m1, [r2+32*2+ 0], 1
+    mova                xm2, [r2-32*4+16]
+    mova                xm3, [r2-32*2+16]
+    vinserti128          m2, [r2+32*0+16], 1
+    vinserti128          m3, [r2+32*2+16], 1
+    pxor                 m4, m4
+    REPX       {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
+    test                r7d, r7d
+    jl .fast
+    lea                  r3, [r2+32*8]
+    mova                xm4, [r3-32*4+ 0]
+    mova                xm5, [r3-32*2+ 0]
+    vinserti128          m4, [r3+32*0+ 0], 1
+    vinserti128          m5, [r3+32*2+ 0], 1
+    mova                xm6, [r3-32*4+16]
+    mova                xm7, [r3-32*2+16]
+    vinserti128          m6, [r3+32*0+16], 1
+    vinserti128          m7, [r3+32*2+16], 1
+.fast:
+    mova              [rsp], m8
+    lea               tmp1q, [rsp+32*7]
+    call m(idct_16x16_internal_8bpc).main
+    mova                 m1, [rsp+32*1]
+    mova       [tmp1q-32*4], m0
+    mova       [tmp1q-32*3], m1
+    mova       [tmp1q-32*2], m2
+    mova       [tmp1q-32*1], m3
+    mova       [tmp1q+32*0], m4
+    mova       [tmp1q+32*1], m5
+    mova       [tmp1q+32*2], m6
+    mova       [tmp1q+32*3], m7
+    add               tmp1q, 32*8
+    mova       [tmp1q-32*4], m8
+    mova       [tmp1q-32*3], m9
+    mova       [tmp1q-32*2], m10
+    mova       [tmp1q-32*1], m11
+    mova       [tmp1q+32*0], m12
+    mova       [tmp1q+32*1], m13
+    mova       [tmp1q+32*2], m14
+    mova       [tmp1q+32*3], m15
+    mova                xm0, [r2-32*3+ 0]
+    mova                xm1, [r2-32*1+ 0]
+    vinserti128          m0, [r2+32*1+ 0], 1
+    vinserti128          m1, [r2+32*3+ 0], 1
+    mova                xm2, [r2-32*3+16]
+    mova                xm3, [r2-32*1+16]
+    vinserti128          m2, [r2+32*1+16], 1
+    vinserti128          m3, [r2+32*3+16], 1
+    pxor                 m4, m4
+    REPX       {mova x, m4}, m5, m6, m7
+    test                r7d, r7d
+    jl .fast2
+    mova                xm4, [r3-32*3+ 0]
+    mova                xm5, [r3-32*1+ 0]
+    vinserti128          m4, [r3+32*1+ 0], 1
+    vinserti128          m5, [r3+32*3+ 0], 1
+    mova                xm6, [r3-32*3+16]
+    mova                xm7, [r3-32*1+16]
+    vinserti128          m6, [r3+32*1+ 0+16], 1
+    vinserti128          m7, [r3+32*3+16], 1
+.fast2:
+    add               tmp1q, 32*8
+    lea               tmp2q, [tmp1q+32*8]
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+    add                  r2, 32*24
+    vpbroadcastd        m15, [o(pd_2048)]
+    add               tmp1q, 32*16
+    add               tmp2q, 32*32
+    mova                xm0, [r2-32*4+ 0]
+    mova                xm3, [r2-32*1+16]
+    vinserti128          m0, [r2+32*0+ 0], 1
+    vinserti128          m3, [r2+32*3+16], 1
+    mova                xm4, [r2-32*4+16]
+    mova                xm7, [r2-32*1+ 0]
+    vinserti128          m4, [r2+32*0+16], 1
+    vinserti128          m7, [r2+32*3+ 0], 1
+    pxor                 m1, m1
+    REPX       {mova x, m1}, m2, m5, m6
+    test                r7d, r7d
+    jl .fast3
+    add                  r3, 32*24
+    mova                xm1, [r3-32*1+16]
+    mova                xm2, [r3-32*4+ 0]
+    vinserti128          m1, [r3+32*3+16], 1
+    vinserti128          m2, [r3+32*0+ 0], 1
+    mova                xm5, [r3-32*1+ 0]
+    mova                xm6, [r3-32*4+16]
+    vinserti128          m5, [r3+32*3+ 0], 1
+    vinserti128          m6, [r3+32*0+16], 1
+.fast3:
+    add                  r6, o_idct64_offset
+    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+    add                  r6, 8
+    add               tmp1q, 32*8
+    sub               tmp2q, 32*8
+    mova                xm0, [r2-32*2+ 0]
+    mova                xm3, [r2-32*3+16]
+    vinserti128          m0, [r2+32*2+ 0], 1
+    vinserti128          m3, [r2+32*1+16], 1
+    mova                xm4, [r2-32*2+16]
+    mova                xm7, [r2-32*3+ 0]
+    vinserti128          m4, [r2+32*2+16], 1
+    vinserti128          m7, [r2+32*1+ 0], 1
+    pxor                 m1, m1
+    REPX       {mova x, m1}, m2, m5, m6
+    test                r7d, r7d
+    jl .fast4
+    mova                xm1, [r3-32*3+16]
+    mova                xm2, [r3-32*2+ 0]
+    vinserti128          m1, [r3+32*1+16], 1
+    vinserti128          m2, [r3+32*2+ 0], 1
+    mova                xm5, [r3-32*3+ 0]
+    mova                xm6, [r3-32*2+16]
+    vinserti128          m5, [r3+32*1+ 0], 1
+    vinserti128          m6, [r3+32*2+16], 1
+.fast4:
+    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2
+    RET
+ALIGN function_align
+%define o_base idct64_mul - 8
+cglobal_label .main_part1
+    ; idct64 steps 1-5:
+    ; in1/31/17/15/ 9/23/25/ 7 ->
+    ;     t32a/33/34a/35/36/37a/38/39a/56a/57/58a/59/60/61a/62/63a
+    ; in5/27/21/11/13/19/29/ 3 ->
+    ;     t40a/41/42a/43/44/45a/46/47a/48a/49/50a/51/52/53a/54/55a
+    vpbroadcastd        m11, [o(idct64_mul+4* 0)]
+    vpbroadcastd        m13, [o(idct64_mul+4* 1)]
+    vpbroadcastd        m10, [o(idct64_mul+4* 4)]
+    vpbroadcastd        m12, [o(idct64_mul+4* 5)]
+    pmulhrsw            m11, m0  ; t63a
+    pmulhrsw             m0, m13 ; t32a
+    pmulhrsw            m10, m1  ; t62a
+    pmulhrsw             m1, m12 ; t33a
+    vpbroadcastd         m9, [o(idct64_mul+4* 8)]
+    vpbroadcastd        m13, [o(idct64_mul+4* 9)]
+    vpbroadcastd         m8, [o(idct64_mul+4*12)]
+    vpbroadcastd        m12, [o(idct64_mul+4*13)]
+    pmulhrsw             m9, m2  ; t61a
+    pmulhrsw             m2, m13 ; t34a
+    pmulhrsw             m8, m3  ; t60a
+    pmulhrsw             m3, m12 ; t35a
+    psubsw              m12, m0, m1  ; t33
+    paddsw               m0, m1      ; t32
+    psubsw               m1, m3, m2  ; t34
+    paddsw               m3, m2      ; t35
+    psubsw               m2, m8, m9  ; t61
+    paddsw               m8, m9      ; t60
+    psubsw               m9, m11, m10 ; t62
+    paddsw              m11, m10      ; t63
+    ITX_MULSUB_2W         2,  1, 10, 13, 15, m4076, 401 ; t34a, t61a
+    vpbroadcastd        m14, [o(pw_401_4076)]
+    ITX_MULSUB_2W         9, 12, 10, 13, 15, 14, 13 ; t33a, t62a
+    psubsw              m10, m0, m3  ; t35a
+    paddsw               m0, m3      ; t32a
+    psubsw               m3, m11, m8 ; t60a
+    paddsw              m11, m8      ; t63a
+    psubsw               m8, m9, m2  ; t34
+    paddsw               m9, m2      ; t33
+    psubsw               m2, m12, m1 ; t61
+    paddsw              m12, m1      ; t62
+    mova       [tmp1q-32*4], m0
+    mova       [tmp1q-32*3], m9
+    mova       [tmp2q+32*2], m12
+    mova       [tmp2q+32*3], m11
+    vpbroadcastd        m13, [o(pw_m4017_799)]
+    vpbroadcastd        m14, [o(pw_799_4017)]
+    ITX_MULSUB_2W         2,  8, 0, 1, 15, 14, 13 ; t34a, t61a
+    ITX_MULSUB_2W         3, 10, 0, 1, 15, 14, 13 ; t35,  t60
+    mova       [tmp1q-32*2], m2
+    mova       [tmp1q-32*1], m3
+    mova       [tmp2q+32*0], m10
+    mova       [tmp2q+32*1], m8
+    vpbroadcastd         m3, [o(idct64_mul+4*16)]
+    vpbroadcastd        m11, [o(idct64_mul+4*17)]
+    vpbroadcastd         m2, [o(idct64_mul+4*20)]
+    vpbroadcastd        m10, [o(idct64_mul+4*21)]
+    vpbroadcastd         m1, [o(idct64_mul+4*24)]
+    vpbroadcastd         m9, [o(idct64_mul+4*25)]
+    vpbroadcastd         m0, [o(idct64_mul+4*28)]
+    vpbroadcastd         m8, [o(idct64_mul+4*29)]
+    pmulhrsw             m3, m4  ; t59a
+    pmulhrsw             m4, m11 ; t36a
+    pmulhrsw             m2, m5  ; t58a
+    pmulhrsw             m5, m10 ; t37a
+    pmulhrsw             m1, m6  ; t57a
+    pmulhrsw             m6, m9  ; t38a
+    pmulhrsw             m0, m7  ; t56a
+    pmulhrsw             m7, m8  ; t39a
+    psubsw               m8, m4, m5 ; t37
+    paddsw               m4, m5     ; t36
+    psubsw               m5, m7, m6 ; t38
+    paddsw               m7, m6     ; t39
+    psubsw               m6, m0, m1 ; t57
+    paddsw               m0, m1     ; t56
+    psubsw               m1, m3, m2 ; t58
+    paddsw               m3, m2     ; t59
+    ITX_MULSUB_2W         6,  5, 2, 9, 15, m2598, 3166 ; t38a, t57a
+    vpbroadcastd        m10, [o(pw_3166_2598)]
+    ITX_MULSUB_2W         1,  8, 2, 9, 15, 10, 9 ; t37a, t58a
+    psubsw               m2, m7, m4 ; t36a
+    paddsw               m7, m4     ; t39a
+    psubsw               m4, m0, m3 ; t59a
+    paddsw               m0, m3     ; t56a
+    psubsw               m3, m6, m1 ; t37
+    paddsw               m6, m1     ; t38
+    psubsw               m1, m5, m8 ; t58
+    paddsw               m5, m8     ; t57
+    mova       [tmp1q+32*2], m6
+    mova       [tmp1q+32*3], m7
+    mova       [tmp2q-32*4], m0
+    mova       [tmp2q-32*3], m5
+    vpbroadcastd         m6, [o(pw_m799_m4017)]
+    vpbroadcastd         m7, [o(pw_m4017_799)]
+    ITX_MULSUB_2W         4,  2, 0, 5, 15, 7, 6 ; t36,  t59
+    ITX_MULSUB_2W         1,  3, 0, 5, 15, 7, 6 ; t37a, t58a
+    mova       [tmp1q+32*0], m4
+    mova       [tmp1q+32*1], m1
+    mova       [tmp2q-32*2], m3
+    mova       [tmp2q-32*1], m2
+    ret
+%define o_base pw_5 + 128
+.main_part2_pass1: ; idct64 steps 6-9 + idct16/32/64 sumsub
+    sub                  r6, o_idct64_offset + 8
+    vpbroadcastd        m11, [o(pw_1567_3784)]
+    vpbroadcastd        m12, [o(pw_m3784_1567)]
+    vpbroadcastd        m13, [o(pw_2896_2896)]
+    vpbroadcastd        m14, [o(pw_m2896_2896)]
+.main_part2_pass1_loop:
+    call .main_part2_internal
+    IDCT64_PART2_END      0, 7, 0, 6, 9, 10
+    IDCT64_PART2_END      7, 8, 5, 0, 6, 7
+    IDCT64_PART2_END      8, 2, 1, 0, 6, 7
+    IDCT64_PART2_END     15, 3, 4, 0, 6, 7
+    cmp               tmp1q, tmp2q
+    jne .main_part2_pass1_loop
+    ret
+cglobal_label .main_part2_internal
+    mova                 m0, [tmp1q-32*12] ; t32a
+    mova                 m6, [tmp2q-32*13] ; t39a
+    mova                 m1, [tmp1q-32* 4] ; t40a
+    mova                 m5, [tmp2q+32* 3] ; t55a
+    add               tmp1q, 32
+    sub               tmp2q, 32
+    mova                 m2, [tmp1q+32* 3] ; t48a
+    mova                 m4, [tmp2q-32* 4] ; t47a
+    mova                 m3, [tmp1q+32*11] ; t56a
+    mova                 m7, [tmp2q+32*12] ; t63a
+    psubsw               m8, m0, m6 ; t39
+    paddsw               m0, m6     ; t32
+    psubsw               m6, m4, m1 ; t40
+    paddsw               m4, m1     ; t47
+    psubsw               m1, m2, m5 ; t55
+    paddsw               m2, m5     ; t48
+    psubsw               m5, m7, m3 ; t56
+    paddsw               m7, m3     ; t63
+    ITX_MULSUB_2W         5,  8, 3, 9, 15, 11, 12 ; t39a, t56a
+    vpbroadcastd         m9, [o(pw_m1567_m3784)]
+    ITX_MULSUB_2W         1,  6, 3, 9, 15, 12, 9 ; t40a, t55a
+    psubsw               m3, m0, m4 ; t47a
+    paddsw               m0, m4     ; t32a
+    psubsw               m4, m7, m2 ; t48a
+    paddsw               m7, m2     ; t63a
+    psubsw               m2, m5, m1 ; t40
+    paddsw               m5, m1     ; t39
+    psubsw               m1, m8, m6 ; t55
+    paddsw               m8, m6     ; t56
+    ITX_MULSUB_2W         4,  3, 6, 9, 15, 13, 14 ; t47,  t48
+    ITX_MULSUB_2W         1,  2, 6, 9, 15, 13, 14 ; t40a, t55a
+    ret
+.main_part2_pass2:
+    sub                  r6, o_idct64_offset + 8
+    vpbroadcastd        m11, [o(pw_1567_3784)]
+    vpbroadcastd        m12, [o(pw_m3784_1567)]
+    vpbroadcastd        m13, [o(pw_2896_2896)]
+    lea                  r9, [strideq*5]    ; stride*5
+    lea                  r3, [r9+strideq*1] ; stride*6
+    lea                  r7, [r9+strideq*2] ; stride*7
+    lea                  r8, [r3+strideq*2] ; stride*8
+    lea                  r2, [dstq+r7]
+.main_part2_pass2_loop:
+    vpbroadcastd        m14, [o(pw_m2896_2896)]
+    call .main_part2_internal
+    vpbroadcastd        m14, [o(pw_2048)]
+    IDCT64_PART2_END      0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*4, r7*8
+    IDCT64_PART2_END      7, 8, 5, 0, 6, 7,  strideq*0, r3*4, r8*4, r7*8
+    IDCT64_PART2_END      8, 2, 1, 0, 6, 7,  strideq*8, r8*2, r9*8, r3*8
+    IDCT64_PART2_END     15, 3, 4, 0, 6, 7,  strideq*8, r8*2, r9*8, r3*8
+    add                dstq, strideq
+    sub                  r2, strideq
+    cmp               tmp1q, tmp2q
+    jne .main_part2_pass2_loop
+    ret
+
+cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 4, 0, dst, stride, c, eob
+    lea                  r6, [o_base]
+    test               eobd, eobd
+    jnz .normal
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_8192)]
+    mov                [cq], eobd
+    or                  r3d, 16
+.dconly:
+    pmulhrsw            xm0, xm2
+    movd                xm2, [o(pw_2048)]
+    pmulhrsw            xm0, xm1
+    pmulhrsw            xm0, xm2
+    vpbroadcastw         m0, xm0
+    pxor                 m1, m1
+.dconly_loop:
+    mova                 m2, [dstq+32*0]
+    mova                 m3, [dstq+32*1]
+    punpckhbw            m4, m2, m1
+    punpcklbw            m2, m1
+    punpckhbw            m5, m3, m1
+    punpcklbw            m3, m1
+    paddw                m4, m0
+    paddw                m2, m0
+    paddw                m5, m0
+    paddw                m3, m0
+    packuswb             m2, m4
+    packuswb             m3, m5
+    mova        [dstq+32*0], m2
+    mova        [dstq+32*1], m3
+    add                dstq, strideq
+    dec                 r3d
+    jg .dconly_loop
+    RET
+.normal:
+    PROLOGUE 0, 7, 16, 32*67, dst, stride, c, eob, tmp1, tmp2
+    LOAD_8ROWS     cq+32*0, 32*4
+    pxor                 m8, m8
+    REPX {mova [cq+32*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
+    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
+    mova              [rsp], m8
+    lea               tmp1q, [rsp+32*7]
+    call m(idct_16x16_internal_8bpc).main
+    mova                 m1, [rsp+32*1]
+    mova       [tmp1q-32*4], m0
+    mova       [tmp1q-32*3], m1
+    mova       [tmp1q-32*2], m2
+    mova       [tmp1q-32*1], m3
+    mova       [tmp1q+32*0], m4
+    mova       [tmp1q+32*1], m5
+    mova       [tmp1q+32*2], m6
+    mova       [tmp1q+32*3], m7
+    add               tmp1q, 32*8
+    mova       [tmp1q-32*4], m8
+    mova       [tmp1q-32*3], m9
+    mova       [tmp1q-32*2], m10
+    mova       [tmp1q-32*1], m11
+    mova       [tmp1q+32*0], m12
+    mova       [tmp1q+32*1], m13
+    mova       [tmp1q+32*2], m14
+    mova       [tmp1q+32*3], m15
+    LOAD_8ROWS     cq+32*2, 32*4
+    pxor                 m8, m8
+    REPX {mova [cq+32*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
+    add               tmp1q, 32*8
+    lea               tmp2q, [tmp1q+32*8]
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+    vpbroadcastd        m15, [o(pd_2048)]
+    add               tmp1q, 32*16
+    add               tmp2q, 32*32
+    mova                 m0, [cq+32* 1]
+    mova                 m1, [cq+32*31]
+    mova                 m2, [cq+32*17]
+    mova                 m3, [cq+32*15]
+    mova                 m4, [cq+32* 9]
+    mova                 m5, [cq+32*23]
+    mova                 m6, [cq+32*25]
+    mova                 m7, [cq+32* 7]
+    pxor                 m8, m8
+    REPX {mova [cq+32*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
+    add                  r6, o_idct64_offset
+    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+    add                  r6, 8
+    add               tmp1q, 32*8
+    sub               tmp2q, 32*8
+    mova                 m0, [cq+32* 5]
+    mova                 m1, [cq+32*27]
+    mova                 m2, [cq+32*21]
+    mova                 m3, [cq+32*11]
+    mova                 m4, [cq+32*13]
+    mova                 m5, [cq+32*19]
+    mova                 m6, [cq+32*29]
+    mova                 m7, [cq+32* 3]
+    pxor                 m8, m8
+    REPX {mova [cq+32*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
+    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1
+    sub               tmp1q, 32*36
+    lea                  r2, [strideq*3]
+    mov               tmp2d, 4
+.pass2_loop:
+    lea                  r3, [tmp1q-32*8]
+    mova                xm0, [r3   -32*4]
+    mova                xm1, [r3   -32*3]
+    vinserti128          m0, [tmp1q-32*4], 1
+    vinserti128          m1, [tmp1q-32*3], 1
+    mova                xm2, [r3   -32*2]
+    mova                xm3, [r3   -32*1]
+    vinserti128          m2, [tmp1q-32*2], 1
+    vinserti128          m3, [tmp1q-32*1], 1
+    mova                xm4, [r3   +32*0]
+    mova                xm5, [r3   +32*1]
+    vinserti128          m4, [tmp1q+32*0], 1
+    vinserti128          m5, [tmp1q+32*1], 1
+    mova                xm6, [r3   +32*2]
+    mova                xm7, [r3   +32*3]
+    vinserti128          m6, [tmp1q+32*2], 1
+    vinserti128          m7, [tmp1q+32*3], 1
+    mova                xm8, [r3   -32*4+16]
+    mova                xm9, [r3   -32*3+16]
+    vinserti128          m8, [tmp1q-32*4+16], 1
+    vinserti128          m9, [tmp1q-32*3+16], 1
+    mova               xm10, [r3   -32*2+16]
+    mova               xm11, [r3   -32*1+16]
+    vinserti128         m10, [tmp1q-32*2+16], 1
+    vinserti128         m11, [tmp1q-32*1+16], 1
+    mova               xm12, [r3   +32*0+16]
+    mova               xm13, [r3   +32*1+16]
+    vinserti128         m12, [tmp1q+32*0+16], 1
+    vinserti128         m13, [tmp1q+32*1+16], 1
+    mova               xm14, [r3   +32*2+16]
+    mova               xm15, [r3   +32*3+16]
+    vinserti128         m14, [tmp1q+32*2+16], 1
+    vinserti128         m15, [tmp1q+32*3+16], 1
+    mova         [rsp+32*0], m6
+    mova         [rsp+32*1], m7
+    vpbroadcastd         m7, [o(pw_8192)]
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
+    call m(idct_16x16_internal_8bpc).main
+    mova         [rsp+32*0], m15
+    vpbroadcastd        m15, [o(pw_2048)]
+    REPX {pmulhrsw x, m15}, m0, m2, m3, m4, m5, m6, m7
+    WRITE_16X2            2, 3, 1, 2, strideq*2, r2
+    pmulhrsw             m1, m15, [rsp+32*1]
+    WRITE_16X2            0, 1, 2, 3, strideq*0, strideq*1
+    lea                  r3, [dstq+strideq*4]
+    %define dstq r3
+    WRITE_16X2            4, 5, 2, 3, strideq*0, strideq*1
+    WRITE_16X2            6, 7, 2, 3, strideq*2, r2
+    REPX {pmulhrsw x, m15}, m8, m9, m10, m11, m12, m13, m14
+    lea                  r3, [r3+strideq*4]
+    WRITE_16X2            8, 9, 2, 3, strideq*0, strideq*1
+    WRITE_16X2           10, 11, 2, 3, strideq*2, r2
+    pmulhrsw            m15, [rsp+32*0]
+    lea                  r3, [r3+strideq*4]
+    WRITE_16X2           12, 13, 2, 3, strideq*0, strideq*1
+    WRITE_16X2           14, 15, 2, 3, strideq*2, r2
+    add               tmp1q, 32*16
+    add                  r0, 16
+    dec               tmp2d
+    jg .pass2_loop
+    RET
+
+cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 4, 0, dst, stride, c, eob
+    lea                  r6, [o_base]
+    test               eobd, eobd
+    jnz .normal
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_16384)]
+    mov                [cq], eobd
+    pmulhrsw            xm0, xm1
+    or                  r3d, 64
+    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
+.normal:
+    PROLOGUE 0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2
+    lea               tmp1q, [rsp+32*7]
+    lea                r10d, [eobq-136]
+    sar                r10d, 31
+.pass1_loop:
+    lea               tmp2q, [tmp1q+32*16]
+    LOAD_8ROWS     cq+64*1, 64*2, 1
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15
+    test               r10b, r10b
+    jnz .fast
+    LOAD_8ROWS_H  cq+64*17, 64*2, 2
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+    LOAD_8ROWS_H  cq+64*16, 64*2, 1
+    mova              [rsp], m15
+    pxor                m15, m15
+    REPX {mova [cq+64*x], m15}, 16, 17, 18, 19, 20, 21, 22, 23, \
+                                24, 25, 26, 27, 28, 29, 30, 31
+    jmp .idct16
+.fast:
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+    pxor                 m8, m8
+    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
+    mova              [rsp], m8
+.idct16:
+    LOAD_8ROWS     cq+64*0, 64*2, 1
+    pxor                m15, m15
+    REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14
+    call m(idct_16x16_internal_8bpc).main
+    call m(inv_txfm_add_dct_dct_32x16_8bpc).pass1_end
+    vpbroadcastd         m7, [o(pw_16384)]
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
+    lea                  r3, [tmp1q+32*48]
+    mova                m15, [rsp]
+    mova          [r3-32*4], m0
+    mova          [r3-32*3], m2
+    mova          [r3-32*2], m4
+    mova          [r3-32*1], m6
+    mova          [r3+32*0], m8
+    mova          [r3+32*1], m10
+    mova          [r3+32*2], m12
+    mova          [r3+32*3], m14
+    add                  r3, 32*24
+    mova          [r3-32*4], m1
+    mova          [r3-32*3], m3
+    mova          [r3-32*2], m5
+    mova          [r3-32*1], m7
+    mova          [r3+32*0], m9
+    mova          [r3+32*1], m11
+    mova          [r3+32*2], m13
+    mova          [r3+32*3], m15
+    vpbroadcastd         m9, [o(pw_16384)]
+    pmulhrsw             m0, m9, [tmp1q-32*4]
+    pmulhrsw             m1, m9, [tmp1q-32*3]
+    pmulhrsw             m2, m9, [tmp1q-32*2]
+    pmulhrsw             m3, m9, [tmp1q-32*1]
+    pmulhrsw             m4, m9, [tmp1q+32*0]
+    pmulhrsw             m5, m9, [tmp1q+32*1]
+    pmulhrsw             m6, m9, [tmp1q+32*2]
+    pmulhrsw             m7, m9, [tmp1q+32*3]
+    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+    mova       [tmp1q-32*4], m0
+    pmulhrsw             m0, m9, [tmp2q-32*4]
+    mova       [tmp2q-32*4], m1
+    pmulhrsw             m1, m9, [tmp2q-32*3]
+    mova       [tmp1q-32*3], m2
+    pmulhrsw             m2, m9, [tmp2q-32*2]
+    mova       [tmp2q-32*3], m3
+    pmulhrsw             m3, m9, [tmp2q-32*1]
+    mova       [tmp1q-32*2], m4
+    pmulhrsw             m4, m9, [tmp2q+32*0]
+    mova       [tmp2q-32*2], m5
+    pmulhrsw             m5, m9, [tmp2q+32*1]
+    mova       [tmp1q-32*1], m6
+    pmulhrsw             m6, m9, [tmp2q+32*2]
+    mova       [tmp2q-32*1], m7
+    pmulhrsw             m7, m9, [tmp2q+32*3]
+    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+    mova       [tmp1q+32*0], m0
+    mova       [tmp2q+32*0], m1
+    mova       [tmp1q+32*1], m2
+    mova       [tmp2q+32*1], m3
+    mova       [tmp1q+32*2], m4
+    mova       [tmp2q+32*2], m5
+    mova       [tmp1q+32*3], m6
+    mova       [tmp2q+32*3], m7
+    add                  cq, 32
+    add               tmp1q, 32*8
+    add                r10d, 0x80000000
+    jnc .pass1_loop
+    lea                  r2, [rsp+32*55]
+    lea                  r7, [r2+32*24]
+.pass2_loop:
+    lea                  r3, [r2+32*8]
+    lea                  r8, [r7+32*8]
+    mova                 m0, [r2-32*4]
+    mova                 m1, [r2-32*2]
+    mova                 m2, [r2+32*0]
+    mova                 m3, [r2+32*2]
+    pxor                 m4, m4
+    REPX       {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
+    test               r10b, r10b
+    jnz .fast2
+    mova                 m4, [r3-32*4]
+    mova                 m5, [r3-32*2]
+    mova                 m6, [r3+32*0]
+    mova                 m7, [r3+32*2]
+.fast2:
+    mova              [rsp], m8
+    lea               tmp1q, [rsp+32*39]
+    call m(idct_16x16_internal_8bpc).main
+    mova                 m1, [rsp+32*1]
+    mova       [tmp1q-32*4], m0
+    mova       [tmp1q-32*3], m1
+    mova       [tmp1q-32*2], m2
+    mova       [tmp1q-32*1], m3
+    mova       [tmp1q+32*0], m4
+    mova       [tmp1q+32*1], m5
+    mova       [tmp1q+32*2], m6
+    mova       [tmp1q+32*3], m7
+    add               tmp1q, 32*8
+    mova       [tmp1q-32*4], m8
+    mova       [tmp1q-32*3], m9
+    mova       [tmp1q-32*2], m10
+    mova       [tmp1q-32*1], m11
+    mova       [tmp1q+32*0], m12
+    mova       [tmp1q+32*1], m13
+    mova       [tmp1q+32*2], m14
+    mova       [tmp1q+32*3], m15
+    mova                 m0, [r2-32*3]
+    mova                 m1, [r2-32*1]
+    mova                 m2, [r2+32*1]
+    mova                 m3, [r2+32*3]
+    pxor                 m4, m4
+    REPX       {mova x, m4}, m5, m6, m7
+    test               r10b, r10b
+    jnz .fast3
+    mova                 m4, [r3-32*3]
+    mova                 m5, [r3-32*1]
+    mova                 m6, [r3+32*1]
+    mova                 m7, [r3+32*3]
+.fast3:
+    add               tmp1q, 32*8
+    lea               tmp2q, [tmp1q+32*8]
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+    vpbroadcastd        m15, [o(pd_2048)]
+    add               tmp1q, 32*16
+    add               tmp2q, 32*32
+    mova                 m0, [r7-32*4]
+    mova                 m3, [r7+32*3]
+    mova                 m4, [r7+32*0]
+    mova                 m7, [r7-32*1]
+    pxor                 m1, m1
+    REPX       {mova x, m1}, m2, m5, m6
+    test               r10b, r10b
+    jnz .fast4
+    mova                 m1, [r8+32*3]
+    mova                 m2, [r8-32*4]
+    mova                 m5, [r8-32*1]
+    mova                 m6, [r8+32*0]
+.fast4:
+    add                  r6, o_idct64_offset
+    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+    add                  r6, 8
+    add               tmp1q, 32*8
+    sub               tmp2q, 32*8
+    mova                 m0, [r7-32*2]
+    mova                 m3, [r7+32*1]
+    mova                 m4, [r7+32*2]
+    mova                 m7, [r7-32*3]
+    pxor                 m1, m1
+    REPX       {mova x, m1}, m2, m5, m6
+    test               r10b, r10b
+    jnz .fast5
+    mova                 m1, [r8+32*1]
+    mova                 m2, [r8-32*2]
+    mova                 m5, [r8-32*3]
+    mova                 m6, [r8+32*2]
+.fast5:
+    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2
+    add                r10d, 0x80000000
+    jc .ret
+    lea                  r2, [rsp+32*7]
+    lea                  r7, [r2+32*16]
+    sub                dstq, r8
+    lea                dstq, [dstq+strideq*4+16]
+    jmp .pass2_loop
+.ret:
+    RET
+
+cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 4, 0, dst, stride, c, eob
+    lea                  r6, [o_base]
+    test               eobd, eobd
+    jnz .normal
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_16384)]
+    mov                [cq], eobd
+    pmulhrsw            xm0, xm1
+    or                  r3d, 32
+    jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
+.normal:
+    PROLOGUE 0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \
+                               base, tmp3, tmp4
+    lea               tmp1q, [rsp+32*7]
+    lea               tmp4d, [eobq-136]
+.pass1_loop:
+    LOAD_8ROWS     cq+64*0, 64*4, 1
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
+    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
+    mova              [rsp], m8
+    call m(idct_16x16_internal_8bpc).main
+    mova                 m1, [rsp+32*1]
+    mova       [tmp1q-32*4], m0
+    mova       [tmp1q-32*3], m1
+    mova       [tmp1q-32*2], m2
+    mova       [tmp1q-32*1], m3
+    mova       [tmp1q+32*0], m4
+    mova       [tmp1q+32*1], m5
+    mova       [tmp1q+32*2], m6
+    mova       [tmp1q+32*3], m7
+    add               tmp1q, 32*8
+    mova       [tmp1q-32*4], m8
+    mova       [tmp1q-32*3], m9
+    mova       [tmp1q-32*2], m10
+    mova       [tmp1q-32*1], m11
+    mova       [tmp1q+32*0], m12
+    mova       [tmp1q+32*1], m13
+    mova       [tmp1q+32*2], m14
+    mova       [tmp1q+32*3], m15
+    LOAD_8ROWS     cq+64*2, 64*4, 1
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
+    add               tmp1q, 32*8
+    lea               tmp2q, [tmp1q+32*8]
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+    vpbroadcastd        m15, [o(pd_2048)]
+    add               tmp1q, 32*16
+    add               tmp2q, 32*32
+    vpbroadcastd         m7, [o(pw_2896x8)]
+    pmulhrsw             m0, m7, [cq+64* 1]
+    pmulhrsw             m1, m7, [cq+64*31]
+    pmulhrsw             m2, m7, [cq+64*17]
+    pmulhrsw             m3, m7, [cq+64*15]
+    pmulhrsw             m4, m7, [cq+64* 9]
+    pmulhrsw             m5, m7, [cq+64*23]
+    pmulhrsw             m6, m7, [cq+64*25]
+    pmulhrsw             m7, [cq+64* 7]
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
+    add                  r6, o_idct64_offset
+    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+    vpbroadcastd         m7, [o(pw_2896x8-(o_idct64_offset))]
+    add                  r6, 8
+    add               tmp1q, 32*8
+    sub               tmp2q, 32*8
+    pmulhrsw             m0, m7, [cq+64* 5]
+    pmulhrsw             m1, m7, [cq+64*27]
+    pmulhrsw             m2, m7, [cq+64*21]
+    pmulhrsw             m3, m7, [cq+64*11]
+    pmulhrsw             m4, m7, [cq+64*13]
+    pmulhrsw             m5, m7, [cq+64*19]
+    pmulhrsw             m6, m7, [cq+64*29]
+    pmulhrsw             m7, [cq+64* 3]
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
+    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1
+    sub               tmp1q, 32*44
+    vpbroadcastd        m10, [o(pw_16384)]
+    call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_round_interleave
+    add                  cq, 32
+    add               tmp4d, 0x80000000
+    jnc .pass1_loop
+    lea               tmp1q, [rsp+32*15]
+    imul                 r2, strideq, 19
+    lea                  r3, [strideq*3]
+    add                  r2, dstq
+    mov               tmp4b, 4
+.pass2_loop:
+    lea               tmp2q, [tmp1q+32*64]
+    LOAD_8ROWS   tmp1q-32*4, 32
+    test              tmp4d, 0x40000000
+    jnz .fast
+    LOAD_8ROWS_H tmp2q-32*4, 32
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+    lea               tmp3q, [tmp2q-32*8]
+    LOAD_8ROWS_H tmp3q-32*4, 32
+    mova              [rsp], m15
+    jmp .idct16
+.fast:
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+    pxor                 m8, m8
+    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
+    mova              [rsp], m8
+.idct16:
+    lea               tmp3q, [tmp1q-32*8]
+    LOAD_8ROWS   tmp3q-32*4, 32
+    call m(idct_16x16_internal_8bpc).main
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).pass2_end
+    add               tmp1q, 32*16
+    sub                dstq, r3
+    lea                  r2, [r2+r3+16]
+    add                dstq, 16
+    dec               tmp4b
+    jg .pass2_loop
+    RET
+ALIGN function_align
+.transpose_round_interleave:
+    mov               tmp3d, 4
+.loop:
+    lea               tmp2q, [tmp1q+32*8]
+    mova                xm0, [tmp1q-32*4]
+    mova                xm1, [tmp1q-32*3]
+    vinserti128          m0, [tmp2q-32*4], 1
+    vinserti128          m1, [tmp2q-32*3], 1
+    mova                xm2, [tmp1q-32*2]
+    mova                xm3, [tmp1q-32*1]
+    vinserti128          m2, [tmp2q-32*2], 1
+    vinserti128          m3, [tmp2q-32*1], 1
+    mova                xm4, [tmp1q+32*0]
+    mova                xm5, [tmp1q+32*1]
+    vinserti128          m4, [tmp2q+32*0], 1
+    vinserti128          m5, [tmp2q+32*1], 1
+    mova                xm6, [tmp1q+32*2]
+    mova                xm7, [tmp1q+32*3]
+    vinserti128          m6, [tmp2q+32*2], 1
+    vinserti128          m7, [tmp2q+32*3], 1
+    REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+    mova                xm8, [tmp1q-32*4+16]
+    mova                xm9, [tmp1q-32*3+16]
+    vinserti128          m8, [tmp2q-32*4+16], 1
+    vinserti128          m9, [tmp2q-32*3+16], 1
+    mova       [tmp1q-32*4], m0
+    mova       [tmp2q-32*4], m1
+    mova       [tmp1q-32*3], m2
+    mova       [tmp2q-32*3], m3
+    mova                xm2, [tmp1q-32*2+16]
+    mova                xm3, [tmp1q-32*1+16]
+    vinserti128          m2, [tmp2q-32*2+16], 1
+    vinserti128          m3, [tmp2q-32*1+16], 1
+    mova       [tmp1q-32*2], m4
+    mova       [tmp2q-32*2], m5
+    mova       [tmp1q-32*1], m6
+    mova       [tmp2q-32*1], m7
+    mova                xm4, [tmp1q+32*0+16]
+    mova                xm5, [tmp1q+32*1+16]
+    vinserti128          m4, [tmp2q+32*0+16], 1
+    vinserti128          m5, [tmp2q+32*1+16], 1
+    mova                xm6, [tmp1q+32*2+16]
+    mova                xm7, [tmp1q+32*3+16]
+    vinserti128          m6, [tmp2q+32*2+16], 1
+    vinserti128          m7, [tmp2q+32*3+16], 1
+    pmulhrsw             m0, m8, m10
+    pmulhrsw             m1, m9, m10
+    REPX {pmulhrsw x, m10}, m2, m3, m4, m5, m6, m7
+    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
+    mova       [tmp1q+32*0], m0
+    mova       [tmp2q+32*0], m1
+    mova       [tmp1q+32*1], m2
+    mova       [tmp2q+32*1], m3
+    mova       [tmp1q+32*2], m4
+    mova       [tmp2q+32*2], m5
+    mova       [tmp1q+32*3], m6
+    mova       [tmp2q+32*3], m7
+    add               tmp1q, 32*16
+    dec               tmp3d
+    jg .loop
+    ret
+
+cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 4, 0, dst, stride, c, eob
+    lea                  r6, [o_base]
+    test               eobd, eobd
+    jnz .normal
+    movd                xm1, [o(pw_2896x8)]
+    pmulhrsw            xm0, xm1, [cq]
+    movd                xm2, [o(pw_8192)]
+    mov                [cq], eobd
+    or                  r3d, 64
+    jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
+.normal:
+    PROLOGUE 0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2
+    lea               tmp1q, [rsp+32*71]
+    lea                r10d, [eobq-136]
+.pass1_loop:
+    LOAD_8ROWS     cq+64*0, 64*4
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
+    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
+    mova              [rsp], m8
+    call m(idct_16x16_internal_8bpc).main
+    mova                 m1, [rsp+32*1]
+    mova       [tmp1q-32*4], m0
+    mova       [tmp1q-32*3], m1
+    mova       [tmp1q-32*2], m2
+    mova       [tmp1q-32*1], m3
+    mova       [tmp1q+32*0], m4
+    mova       [tmp1q+32*1], m5
+    mova       [tmp1q+32*2], m6
+    mova       [tmp1q+32*3], m7
+    add               tmp1q, 32*8
+    mova       [tmp1q-32*4], m8
+    mova       [tmp1q-32*3], m9
+    mova       [tmp1q-32*2], m10
+    mova       [tmp1q-32*1], m11
+    mova       [tmp1q+32*0], m12
+    mova       [tmp1q+32*1], m13
+    mova       [tmp1q+32*2], m14
+    mova       [tmp1q+32*3], m15
+    LOAD_8ROWS     cq+64*2, 64*4
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
+    add               tmp1q, 32*8
+    lea               tmp2q, [tmp1q+32*8]
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+    vpbroadcastd        m15, [o(pd_2048)]
+    add               tmp1q, 32*16
+    add               tmp2q, 32*32
+    mova                 m0, [cq+64* 1]
+    mova                 m1, [cq+64*31]
+    mova                 m2, [cq+64*17]
+    mova                 m3, [cq+64*15]
+    mova                 m4, [cq+64* 9]
+    mova                 m5, [cq+64*23]
+    mova                 m6, [cq+64*25]
+    mova                 m7, [cq+64* 7]
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
+    add                  r6, o_idct64_offset
+    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+    add                  r6, 8
+    add               tmp1q, 32*8
+    sub               tmp2q, 32*8
+    mova                 m0, [cq+64* 5]
+    mova                 m1, [cq+64*27]
+    mova                 m2, [cq+64*21]
+    mova                 m3, [cq+64*11]
+    mova                 m4, [cq+64*13]
+    mova                 m5, [cq+64*19]
+    mova                 m6, [cq+64*29]
+    mova                 m7, [cq+64* 3]
+    pxor                 m8, m8
+    REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
+    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1
+    sub               tmp1q, 32*44
+    vpbroadcastd        m10, [o(pw_8192)]
+    call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_round_interleave
+    add                  cq, 32
+    add                r10d, 0x80000000
+    jnc .pass1_loop
+    lea               tmp1q, [rsp+32*7]
+    mov                r10b, 4
+.pass2_loop:
+    lea                  r2, [tmp1q+32*64]
+    mova                 m0, [r2-32*4]
+    mova                 m1, [r2-32*2]
+    mova                 m2, [r2+32*0]
+    mova                 m3, [r2+32*2]
+    pxor                 m4, m4
+    REPX       {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
+    mova              [rsp], m4
+    test               r10d, 0x40000000
+    jnz .fast
+    lea                  r3, [r2+32*64]
+    mova                 m4, [r3-32*4]
+    mova                 m5, [r3-32*2]
+    mova                 m6, [r3+32*0]
+    mova                 m7, [r3+32*2]
+.fast:
+    call m(idct_16x16_internal_8bpc).main
+    mova                 m1, [rsp+32*1]
+    mova       [tmp1q-32*4], m0
+    mova       [tmp1q-32*3], m1
+    mova       [tmp1q-32*2], m2
+    mova       [tmp1q-32*1], m3
+    mova       [tmp1q+32*0], m4
+    mova       [tmp1q+32*1], m5
+    mova       [tmp1q+32*2], m6
+    mova       [tmp1q+32*3], m7
+    add               tmp1q, 32*8
+    mova       [tmp1q-32*4], m8
+    mova       [tmp1q-32*3], m9
+    mova       [tmp1q-32*2], m10
+    mova       [tmp1q-32*1], m11
+    mova       [tmp1q+32*0], m12
+    mova       [tmp1q+32*1], m13
+    mova       [tmp1q+32*2], m14
+    mova       [tmp1q+32*3], m15
+    mova                 m0, [r2-32*3]
+    mova                 m1, [r2-32*1]
+    mova                 m2, [r2+32*1]
+    mova                 m3, [r2+32*3]
+    pxor                 m4, m4
+    REPX       {mova x, m4}, m5, m6, m7
+    test               r10d, 0x40000000
+    jnz .fast2
+    mova                 m4, [r3-32*3]
+    mova                 m5, [r3-32*1]
+    mova                 m6, [r3+32*1]
+    mova                 m7, [r3+32*3]
+.fast2:
+    add               tmp1q, 32*8
+    lea               tmp2q, [tmp1q+32*8]
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+    vpbroadcastd        m15, [o(pd_2048)]
+    add                  r2, 32*8
+    add                  r3, 32*8
+    add               tmp1q, 32*16
+    add               tmp2q, 32*32
+    mova                 m0, [r2-32*4] ;  1
+    mova                 m3, [r2+32*3] ; 15
+    mova                 m4, [r2+32*0] ;  9
+    mova                 m7, [r2-32*1] ;  7
+    pxor                 m1, m1
+    REPX       {mova x, m1}, m2, m5, m6
+    test               r10d, 0x40000000
+    jnz .fast3
+    mova                 m1, [r3+32*3] ; 31
+    mova                 m2, [r3-32*4] ; 17
+    mova                 m5, [r3-32*1] ; 23
+    mova                 m6, [r3+32*0] ; 25
+.fast3:
+    add                  r6, o_idct64_offset
+    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+    add                  r6, 8
+    add               tmp1q, 32*8
+    sub               tmp2q, 32*8
+    mova                 m0, [r2-32*2] ;  5
+    mova                 m3, [r2+32*1] ; 11
+    mova                 m4, [r2+32*2] ; 13
+    mova                 m7, [r2-32*3] ;  3
+    pxor                 m1, m1
+    REPX       {mova x, m1}, m2, m5, m6
+    test               r10d, 0x40000000
+    jnz .fast4
+    mova                 m1, [r3+32*1] ; 27
+    mova                 m2, [r3-32*2] ; 21
+    mova                 m5, [r3-32*3] ; 19
+    mova                 m6, [r3+32*2] ; 29
+.fast4:
+    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2
+    sub               tmp1q, 32*28
+    sub                dstq, r8
+    lea                dstq, [dstq+strideq*4+16]
+    dec                r10b
+    jg .pass2_loop
+    RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/itx_avx512.asm b/third_party/dav1d/src/x86/itx_avx512.asm
new file mode 100644
index 0000000000..31c60fdd45
--- /dev/null
+++ b/third_party/dav1d/src/x86/itx_avx512.asm
@@ -0,0 +1,7389 @@
+; Copyright © 2020-2023, VideoLAN and dav1d authors
+; Copyright © 2020-2023, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+const \
+dup16_perm, db  0,  1,  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7
+            db  8,  9,  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15
+            db 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23
+            db 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29, 30, 31, 30, 31
+const \
+int8_permA, db  0,  1, 16, 17, 32, 33, 48, 49,  2,  3, 18, 19, 34, 35, 50, 51
+            db  4,  5, 20, 21, 36, 37, 52, 53,  6,  7, 22, 23, 38, 39, 54, 55
+            db  8,  9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59
+            db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63
+int8_permB: db  0,  1, 16, 17, 32, 33, 48, 49,  2,  3, 18, 19, 34, 35, 50, 51
+            db  8,  9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59
+            db  4,  5, 20, 21, 36, 37, 52, 53,  6,  7, 22, 23, 38, 39, 54, 55
+            db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63
+int16_perm: db  0,  1, 32, 33,  2,  3, 34, 35,  4,  5, 36, 37,  6,  7, 38, 39
+            db  8,  9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
+            db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
+            db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
+idtx_16x4p: db  0,  1,  4,  5, 16, 17, 20, 21,  2,  3,  6,  7, 18, 19, 22, 23
+            db 32, 33, 36, 37, 48, 49, 52, 53, 34, 35, 38, 39, 50, 51, 54, 55
+            db  8,  9, 12, 13, 24, 25, 28, 29, 10, 11, 14, 15, 26, 27, 30, 31
+            db 40, 41, 44, 45, 56, 57, 60, 61, 42, 43, 46, 47, 58, 59, 62, 63
+idct_8x32p: db 60, 61,  4,  5, 32, 33,  0,  1, 28, 29, 36, 37, 56, 57,  8,  9
+            db 12, 13, 52, 53, 24, 25, 40, 41, 44, 45, 20, 21, 48, 49, 16, 17
+            db 62, 63,  2,  3,  6,  7, 58, 59, 54, 55, 10, 11, 14, 15, 50, 51
+            db 46, 47, 18, 19, 22, 23, 42, 43, 38, 39, 26, 27, 30, 31, 34, 35
+idct_16x32p: db  6,  7, 58, 59, 38, 39, 26, 27, 32, 33,  0,  1, 30, 31, 34, 35
+             db 46, 47, 18, 19, 22, 23, 42, 43, 24, 25, 40, 41, 44, 45, 20, 21
+             db 62, 63,  2,  3, 48, 49, 16, 17, 56, 57,  8,  9, 14, 15, 50, 51
+             db 54, 55, 10, 11, 60, 61,  4,  5, 12, 13, 52, 53, 28, 29, 36, 37
+end_16x32p: db  0, 32,  1, 48,  2, 36,  3, 52, 16, 40, 17, 56, 18, 44, 19, 60
+            db  4, 33,  5, 49,  6, 37,  7, 53, 20, 41, 21, 57, 22, 45, 23, 61
+            db  8, 35,  9, 51, 10, 39, 11, 55, 24, 43, 25, 59, 26, 47, 27, 63
+            db 12, 34, 13, 50, 14, 38, 15, 54, 28, 42, 29, 58, 30, 46, 31, 62
+
+; packed 4-bit qword shuffle indices
+permA: dq 0x1c0d0d1ce0d94040, 0x5849495868fb6262
+       dq 0x3e2f2f3ef1c85151, 0x7a6b6b7a79ea7373
+       dq 0x94858594a451c8d9, 0xd0c1c1d02c73eafb
+       dq 0xb6a7a7b6b540d9c8, 0xf2e3e3f23d62fbea
+permB: dq 0x40acbd0fcadb0f40, 0x518e9f3ce8f99604
+       dq 0xc824352d56128751, 0xd906171e74301e15
+       dq 0x6271604b03472d62, 0x735342782165b426
+       dq 0xeaf9e8699f8ea573, 0xfbdbca5abdac3c37
+permC: dq 0x9d409d041551c2e0, 0xbf62bf263773a486
+       dq 0xc88c8c15409dd3f1, 0xeaaeae3762bfb597
+       dq 0x04d9158c8cc84a68, 0x26fb37aeaeea2c0e
+       dq 0x5115049dd9045b79, 0x733726bffb263d1f
+permD: dq 0x0cda098800041504, 0x0edb09b2028c3726
+       dq 0x0f11fa9c01150415, 0x0988f326039d2637
+       dq 0x05640f1108269d8c, 0x05290edb0aaebfae
+       dq 0x0005000509378c9d, 0xffffffff0bbfaebf
+
+pd_0to15: dd  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
+gather8a: dd  0,  2,  1,  3,  8, 10,  9, 11
+gather8b: dd  0,  1,  4,  5,  8,  9, 12, 13
+gather8c: dd  0,  4,  2,  6, 12,  8, 14, 10
+gather8d: dd  0, 19,  1, 18,  2, 17,  3, 16
+
+int_shuf1: db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
+int_shuf2: db  8,  9,  0,  1, 10, 11,  2,  3, 12, 13,  4,  5, 14, 15,  6,  7
+int_shuf3: db  0,  1,  8,  9,  4,  5, 12, 13,  2,  3, 10, 11,  6,  7, 14, 15
+int_shuf4: db  8,  9,  0,  1, 12, 13,  4,  5, 10, 11,  2,  3, 14, 15,  6,  7
+deint_shuf: db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
+int_mshift: db 12, 20, 0, 0, 44, 52, 0, 0
+
+pb_32:     times 4 db 32
+pw_2048:   times 2 dw 2048
+pw_4096:   times 2 dw 4096
+pw_8192:   times 2 dw 8192
+pw_16384:  times 2 dw 16384
+pw_1697x16: times 2 dw 1697*16
+pw_1697x8:  times 2 dw 1697*8
+pw_2896x8:  times 2 dw 2896*8
+pd_2048:   dd 2048
+
+%define pw_5          (permD+52)
+%define pd_m1         (permD+60)
+%define pw_3803_1321  (permD+44)
+%define pw_2482_3803  (permD+12)
+%define pw_2440_3290  (permD+ 4)
+%define pw_m3290_2440 (permD+28)
+%define pw_3857_1380  (permD+36)
+%define pw_m1380_3857 (permD+20)
+
+pw_8192_m8192:   dw  8192, -8192
+pw_m8192_8192:   dw -8192,  8192
+pw_16384_m16384: dw  16384, -16384
+pw_m16384_16384: dw -16384,  16384
+
+pw_m1321_2482:  dw -1321,  2482
+pw_m3344_3344:  dw -3344,  3344
+pw_2482_3344:   dw  2482,  3344
+pw_m3803_3344:  dw -3803,  3344
+pd_3344:        dd  3344
+pw_m1321_m3344: dw -1321, -3344
+pw_2896_m2896:  dw  2896, -2896
+
+pw_1567_m3784:  dw  1567, -3784
+pw_3784_m1567:  dw  3784, -1567
+pw_4017_m799:   dw  4017,  -799
+pw_2276_m3406:  dw  2276, -3406
+pw_m799_m4017:  dw  -799, -4017
+pw_m3406_m2276: dw -3406, -2276
+
+%macro COEF_PAIR 2-3 0
+pw_%1_%2:  dw  %1,  %2
+pw_m%2_%1: dw -%2,  %1
+%if %3
+pw_m%1_m%2: dw -%1, -%2
+%endif
+%endmacro
+
+COEF_PAIR 2896, 2896
+COEF_PAIR 1567, 3784, 1
+COEF_PAIR 3784, 1567
+COEF_PAIR  201, 4091
+COEF_PAIR  995, 3973
+COEF_PAIR 1751, 3703
+COEF_PAIR 3035, 2751
+COEF_PAIR 3513, 2106
+COEF_PAIR 4052,  601
+COEF_PAIR 3166, 2598, 1
+COEF_PAIR 3920, 1189, 1
+COEF_PAIR 2276, 3406
+COEF_PAIR 4017,  799
+
+%macro COEF_X8 1-*
+%rep %0
+    dw %1*8, %1*8
+    %rotate 1
+%endrep
+%endmacro
+
+pw_m2276x8: COEF_X8 -2276
+pw_3406x8:  COEF_X8  3406
+pw_4017x8:  COEF_X8  4017
+pw_799x8:   COEF_X8   799
+pw_3784x8:  COEF_X8  3784
+pw_1567x8:  COEF_X8  1567
+
+pw_4076x8:  COEF_X8  4076
+pw_401x8:   COEF_X8   401
+pw_m2598x8: COEF_X8 -2598
+pw_3166x8:  COEF_X8  3166
+pw_3612x8:  COEF_X8  3612
+pw_1931x8:  COEF_X8  1931
+pw_m1189x8: COEF_X8 -1189
+pw_3920x8:  COEF_X8  3920
+
+pw_4091x8:  COEF_X8  4091
+pw_201x8:   COEF_X8   201
+pw_m2751x8: COEF_X8 -2751
+pw_3035x8:  COEF_X8  3035
+pw_3703x8:  COEF_X8  3703
+pw_1751x8:  COEF_X8  1751
+pw_m1380x8: COEF_X8 -1380
+pw_3857x8:  COEF_X8  3857
+pw_3973x8:  COEF_X8  3973
+pw_995x8:   COEF_X8   995
+pw_m2106x8: COEF_X8 -2106
+pw_3513x8:  COEF_X8  3513
+pw_3290x8:  COEF_X8  3290
+pw_2440x8:  COEF_X8  2440
+pw_m601x8:  COEF_X8  -601
+pw_4052x8:  COEF_X8  4052
+
+pw_401_4076x8:   dw   401*8, 4076*8
+pw_m2598_3166x8: dw -2598*8, 3166*8
+pw_1931_3612x8:  dw  1931*8, 3612*8
+pw_m1189_3920x8: dw -1189*8, 3920*8
+pw_799_4017x8:   dw   799*8, 4017*8
+pw_m2276_3406x8: dw -2276*8, 3406*8
+
+pw_201_4091x8:   dw   201*8, 4091*8
+pw_m601_4052x8:  dw  -601*8, 4052*8
+pw_995_3973x8:   dw   995*8, 3973*8
+pw_m1380_3857x8: dw -1380*8, 3857*8
+pw_1751_3703x8:  dw  1751*8, 3703*8
+pw_m2106_3513x8: dw -2106*8, 3513*8
+pw_2440_3290x8:  dw  2440*8, 3290*8
+pw_m2751_3035x8: dw -2751*8, 3035*8
+
+pw_101_4095x8:   dw   101*8, 4095*8
+pw_m2824_2967x8: dw -2824*8, 2967*8
+pw_1660_3745x8:  dw  1660*8, 3745*8
+pw_m1474_3822x8: dw -1474*8, 3822*8
+pw_897_3996x8:   dw   897*8, 3996*8
+pw_m2191_3461x8: dw -2191*8, 3461*8
+pw_2359_3349x8:  dw  2359*8, 3349*8
+pw_m700_4036x8:  dw  -700*8, 4036*8
+pw_501_4065x8:   dw   501*8, 4065*8
+pw_m2520_3229x8: dw -2520*8, 3229*8
+pw_2019_3564x8:  dw  2019*8, 3564*8
+pw_m1092_3948x8: dw -1092*8, 3948*8
+pw_1285_3889x8:  dw  1285*8, 3889*8
+pw_m1842_3659x8: dw -1842*8, 3659*8
+pw_2675_3102x8:  dw  2675*8, 3102*8
+pw_m301_4085x8:  dw  -301*8, 4085*8
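+; (Reviewer note, not from the upstream dav1d source:) the *x8 constants
+; above are the 12-bit transform coefficients pre-scaled by 8 so that a
+; single pmulhrsw yields the rounded fixed-point product directly:
+;     pmulhrsw(x, c*8) = (x*c*8 + 16384) >> 15 = (x*c + 2048) >> 12
+; idct64_mul below packs the same coefficients per butterfly stage for the
+; 64-point DCT.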
+idct64_mul: COEF_X8  4095,   101,  2967, -2824,  3745,  1660,  3822, -1474
+COEF_PAIR  401, 4076, 1
+COEF_PAIR  799, 4017
+            COEF_X8  -700,  4036,  2359,  3349, -2191,  3461,   897,  3996
+dw    -2598, -3166,  3166, -2598,  2598,  3166, -4017,  -799,   799, -4017
+            COEF_X8  4065,   501,  3229, -2520,  3564,  2019,  3948, -1092
+COEF_PAIR 1931, 3612, 1
+COEF_PAIR 3406, 2276
+            COEF_X8  -301,  4085,  2675,  3102, -1842,  3659,  1285,  3889
+dw    -1189, -3920,  3920, -1189,  1189,  3920, -2276, -3406,  3406, -2276
+
+SECTION .text
+
+%define o_base int8_permA+64*18
+%define o(x) (r5 - (o_base) + (x))
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+; flags: 1 = swap, 2 = interleave (l), 4 = interleave (t), 8 = no_pack,
+;        16 = special_mul1, 32 = special_mul2
+%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags
+    mova                m%2, m%4
+%if %7 & 16
+    vpdpwssd            m%2, m%1, [o(pw_%5)] {bcstd}
+    mova                m%3, m%4
+%if %7 & 32
+    vpdpwssd            m%3, m%1, [o(pw_%6)] {bcstd}
+%else
+    vpdpwssd            m%3, m%1, m%6
+%endif
+%elif %7 & 32
+    vpdpwssd            m%2, m%1, m%5
+    mova                m%3, m%4
+    vpdpwssd            m%3, m%1, [o(pw_%6)] {bcstd}
+%elif %6 < 32
+    vpdpwssd            m%2, m%1, m%5
+    mova                m%3, m%4
+    vpdpwssd            m%3, m%1, m%6
+%elif %7 & 1
+    vpdpwssd            m%2, m%1, [o(pw_%5_%6)] {bcstd}
+    mova                m%3, m%4
+    vpdpwssd            m%3, m%1, [o(pw_m%6_%5)] {bcstd}
+%else
+    vpdpwssd            m%2, m%1, [o(pw_m%6_%5)] {bcstd}
+    mova                m%3, m%4
+    vpdpwssd            m%3, m%1, [o(pw_%5_%6)] {bcstd}
+%endif
+%if %7 & 2
+    psrld               m%2, 12
+    pslld               m%3, 4
+    vpshrdd             m%1, m%3, m%2, 16
+%elif %7 & 4
+    ; compared to using shifts (as above) this has better throughput,
+    ; but worse latency and requires setting up the opmask/index
+    ; registers, so only use this method for the larger transforms
+    pslld               m%1, m%2, 4
+    vpmultishiftqb  m%1{k7}, m13, m%3
+%else
+    psrad               m%2, 12
+    psrad               m%3, 12
+%if %7 & 8 == 0
+    packssdw            m%1, m%3, m%2
+%endif
+%endif
+%endmacro
+
+; flags: same as ITX_MUL2X_PACK
+%macro ITX_MUL4X_PACK 10-11 0 ; dst/src, tmp[1-2], coef_tmp[1-2], rnd, coef[1-4], flags
+%if %11 & 1
+    vpbroadcastd        m%4, [o(pw_%9_%10)]
+    vpbroadcastd    m%4{k1}, [o(pw_%7_%8)]
+    vpbroadcastd        m%5, [o(pw_m%10_%9)]
+    vpbroadcastd    m%5{k1}, [o(pw_m%8_%7)]
+%else
+    vpbroadcastd        m%4, [o(pw_m%10_%9)]
+    vpbroadcastd    m%4{k1}, [o(pw_m%8_%7)]
+    vpbroadcastd        m%5, [o(pw_%9_%10)]
+    vpbroadcastd    m%5{k1}, [o(pw_%7_%8)]
+%endif
+    ITX_MUL2X_PACK       %1, %2, %3, %6, %4, %5, %11
+%endmacro
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
+%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2
+    punpcklwd           m%3, m%2, m%1
+    punpckhwd           m%2, m%1
+%if %7 < 32
+    mova                m%1, m%5
+    vpdpwssd            m%1, m%3, m%7
+    mova                m%4, m%5
+    vpdpwssd            m%4, m%2, m%7
+%else
+    mova                m%1, m%5
+    vpdpwssd            m%1, m%3, [o(pw_m%7_%6)] {bcstd}
+    mova                m%4, m%5
+    vpdpwssd            m%4, m%2, [o(pw_m%7_%6)] {bcstd}
+%endif
+    psrad               m%1, 12
+    psrad               m%4, 12
+    packssdw            m%1, m%4
+    mova                m%4, m%5
+%if %7 < 32
+    vpdpwssd            m%4, m%2, m%6
+    mova                m%2, m%5
+    vpdpwssd            m%2, m%3, m%6
+%else
+    vpdpwssd            m%4, m%2, [o(pw_%6_%7)] {bcstd}
+    mova                m%2, m%5
+    vpdpwssd            m%2, m%3, [o(pw_%6_%7)] {bcstd}
+%endif
+    psrad               m%4, 12
+    psrad               m%2, 12
+%if %0 == 8
+    packssdw            m%8, m%2, m%4
+%else
+    packssdw            m%2, m%4
+%endif
+%endmacro
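+; (Reviewer sketch, not from the upstream dav1d source:) per 16-bit lane,
+; the macro above is the documented rotation butterfly; in scalar C:
+;
+;     static inline void itx_mulsub_2w(int *dst1, int *dst2,
+;                                      int src1, int src2,
+;                                      int coef1, int coef2)
+;     {   /* rnd is pd_2048 at the call sites in this file */
+;         *dst1 = (src1 * coef1 - src2 * coef2 + 2048) >> 12;
+;         *dst2 = (src1 * coef2 + src2 * coef1 + 2048) >> 12;
+;     }
+;
+; The asm version does up to 32 lanes at once: punpck{l,h}wd interleaves
+; the two source vectors, vpdpwssd forms the dot products against packed
+; (coef, coef) word pairs, and packssdw narrows back to saturated int16.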
%else + CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1) + %endif + %assign %%i %%i + 1 + %rotate 1 +%endrep + movd m2, [%%row_adr1] + pinsrd m2, [%%row_adr2], 1 + movd m3, [%%row_adr3] + pinsrd m3, [%%row_adr4], 1 + pmovzxbw m2, m2 + pmovzxbw m3, m3 + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + movd [%%row_adr1], m0 + pextrd [%%row_adr2], m0, 1 + pextrd [%%row_adr3], m0, 2 + pextrd [%%row_adr4], m0, 3 + ret +%endmacro + +%macro INV_TXFM_FN 3 ; type1, type2, size +cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, 0, dst, stride, c, eob, tx2, base + %define %%p1 m(i%1_%3_internal_8bpc) + lea baseq, [o_base] + ; Jump to the 1st txfm function if we're not taking the fast path, which + ; in turn performs an indirect jump to the 2nd txfm function. + lea tx2q, [m(i%2_%3_internal_8bpc).pass2] +%ifidn %1_%2, dct_dct + test eobd, eobd + jnz %%p1 +%else + ; jump to the 1st txfm function unless it's located directly after this + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endmacro + +%macro INV_TXFM_4X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x4 +%ifidn %1_%2, dct_dct + vpbroadcastw m0, [cq] + vpbroadcastd m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + mov [cq], eobd + pmulhrsw m0, m1 + mova m1, m0 + jmp m(iadst_4x4_internal_8bpc).end2 +%endif +%endmacro + +%macro IDCT4_1D_PACKED 0 + vpbroadcastd m4, [o(pd_2048)] + punpckhwd m2, m1, m0 + punpcklwd m1, m0 + ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784 + ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896 + paddsw m0, m1, m2 ; out0 out1 + psubsw m1, m2 ; out3 out2 +%endmacro + +%macro IADST4_1D_PACKED 0 + punpcklwd m4, m1, m0 ; in2 in0 + punpckhwd m5, m1, m0 ; in3 in1 +.main2: + vpbroadcastd m3, [o(pd_2048)] + mova m0, m3 + vpdpwssd m0, m4, [o(pw_3803_1321)] {bcstd} + mova m2, m3 + vpdpwssd m2, m4, [o(pw_m1321_2482)] {bcstd} + mova m1, m3 + vpdpwssd m1, m4, [o(pw_m3344_3344)] {bcstd} + vpdpwssd m3, m4, [o(pw_2482_3803)] {bcstd} + vpdpwssd m0, m5, [o(pw_2482_3344)] {bcstd} + vpdpwssd m2, m5, [o(pw_m3803_3344)] {bcstd} + vpdpwssd m1, m5, [o(pd_3344)] {bcstd} + vpdpwssd m3, m5, [o(pw_m1321_m3344)] {bcstd} + REPX {psrad x, 12}, m0, m2, m1, m3 + packssdw m0, m2 ; out0 out1 + packssdw m1, m3 ; out2 out3 +%endmacro + +INIT_XMM avx512icl +INV_TXFM_4X4_FN dct, dct +INV_TXFM_4X4_FN dct, adst +INV_TXFM_4X4_FN dct, flipadst +INV_TXFM_4X4_FN dct, identity + +cglobal idct_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + IDCT4_1D_PACKED + mova m2, [o(deint_shuf)] + shufps m3, m0, m1, q1331 + shufps m0, m0, m1, q0220 + pshufb m0, m2 + pshufb m1, m3, m2 + jmp tx2q +.pass2: + IDCT4_1D_PACKED + pxor ymm16, ymm16 + mova [cq], ymm16 + ITX4_END 0, 1, 3, 2 + +INV_TXFM_4X4_FN adst, dct +INV_TXFM_4X4_FN adst, adst +INV_TXFM_4X4_FN adst, flipadst +INV_TXFM_4X4_FN adst, identity + +cglobal iadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + call .main + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + call .main +.end: + pxor ymm16, ymm16 + mova [cq], ymm16 +.end2: + ITX4_END 0, 1, 2, 3 +ALIGN function_align +.main: + IADST4_1D_PACKED + ret + +INV_TXFM_4X4_FN flipadst, dct +INV_TXFM_4X4_FN flipadst, adst +INV_TXFM_4X4_FN flipadst, flipadst +INV_TXFM_4X4_FN flipadst, identity + +cglobal iflipadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + call m(iadst_4x4_internal_8bpc).main + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + jmp 
tx2q +.pass2: + call m(iadst_4x4_internal_8bpc).main +.end: + pxor ymm16, ymm16 + mova [cq], ymm16 +.end2: + ITX4_END 3, 2, 1, 0 + +INV_TXFM_4X4_FN identity, dct +INV_TXFM_4X4_FN identity, adst +INV_TXFM_4X4_FN identity, flipadst +INV_TXFM_4X4_FN identity, identity + +cglobal iidentity_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + vpbroadcastd m3, [o(pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + jmp tx2q +.pass2: + vpbroadcastd m3, [o(pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + jmp m(iadst_4x4_internal_8bpc).end + +%macro INV_TXFM_4X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x8 +%ifidn %1_%2, dct_dct + movd xmm1, [o(pw_2896x8)] + pmulhrsw xmm0, xmm1, [cq] + movd xmm2, [o(pw_2048)] + pmulhrsw xmm0, xmm1 + pmulhrsw xmm0, xmm1 + pmulhrsw xmm0, xmm2 + vpbroadcastw ym0, xmm0 + mova ym1, ym0 + jmp m(iadst_4x8_internal_8bpc).end3 +%endif +%endmacro + +%macro IDCT8_1D_PACKED 0 + punpckhwd m5, m3, m0 ; in7 in1 + punpckhwd m4, m1, m2 ; in3 in5 + punpcklwd m3, m1 ; in6 in2 + punpcklwd m2, m0 ; in4 in0 +.main2: + vpbroadcastd m6, [o(pd_2048)] + ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a + ITX_MUL2X_PACK 4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a + ITX_MUL2X_PACK 3, 0, 1, 6, 1567, 3784 ; t3 t2 + psubsw m0, m5, m4 ; t5a t6a (interleaved) + paddsw m4, m5 ; t4 t7 (interleaved) + ITX_MUL2X_PACK 2, 1, 5, 6, 2896, 2896 ; t0 t1 + ITX_MUL2X_PACK 0, 1, 5, 6, 2896, 2896, 1 ; t6 t5 +%if mmsize > 16 + vbroadcasti32x4 m1, [o(deint_shuf)] + pshufb m4, m1 +%else + pshufb m4, [o(deint_shuf)] +%endif + psubsw m1, m2, m3 ; tmp3 tmp2 + paddsw m3, m2 ; tmp0 tmp1 + punpckhqdq m2, m4, m0 ; t7 t6 + punpcklqdq m4, m0 ; t4 t5 + paddsw m0, m3, m2 ; out0 out1 + psubsw m3, m2 ; out7 out6 + psubsw m2, m1, m4 ; out4 out5 + paddsw m1, m4 ; out3 out2 +%endmacro + +%macro IADST8_1D_PACKED 1 ; pass + vpbroadcastd m6, [o(pd_2048)] +%if %1 == 1 + ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a + ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a + ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a + ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a + psubsw m4, m0, m2 ; t5 t4 + paddsw m0, m2 ; t1 t0 + psubsw m5, m1, m3 ; t6 t7 + paddsw m1, m3 ; t2 t3 + ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a + ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a +%if mmsize > 16 + vbroadcasti32x4 m2, [o(deint_shuf)] +%else + mova m2, [o(deint_shuf)] +%endif + vprord m1, 16 + psubsw m3, m0, m1 ; t3 t2 + paddsw m0, m1 ; -out7 out0 + psubsw m1, m4, m5 ; t7 t6 + paddsw m4, m5 ; out6 -out1 + pshufb m0, m2 + pshufb m4, m2 + mova m2, m6 + vpdpwssd m2, m3, [o(pw_m2896_2896)] {bcstd} + mova m5, m6 + vpdpwssd m5, m1, [o(pw_m2896_2896)] {bcstd} + psrad m2, 12 + psrad m5, 12 + packssdw m2, m5 ; out4 -out5 + mova m5, m6 + vpdpwssd m5, m3, [o(pw_2896_2896)] {bcstd} + mova m3, m6 + vpdpwssd m3, m1, [o(pw_2896_2896)] {bcstd} + psrad m5, 12 + psrad m3, 12 + packssdw m1, m3, m5 ; out2 -out3 +%else + punpckhwd m0, m4, m3 ; 0 7 + punpckhwd m1, m5, m2 ; 2 5 + punpcklwd m2, m5 ; 4 3 + punpcklwd m3, m4 ; 6 1 + ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a + ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a + ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a + ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a + psubsw m4, m0, m2 ; t4 t5 + paddsw m0, m2 ; t0 t1 + psubsw m5, m1, m3 ; t6 t7 + paddsw m1, m3 ; t2 t3 + shufps m2, m5, m4, q1032 + punpckhwd m4, m2 
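+    ; vpdpwssd multiplies adjacent signed words, so this shufps/punpckhwd/
+    ; punpcklwd repacking puts t4-t7 into the word-interleaved layout
+    ; consumed by the 1567/3784 rotations below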
+ punpcklwd m5, m2 + ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784 ; t4a t5a + ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 1 ; t6a t7a + psubsw m2, m0, m1 ; t2 t3 + paddsw m0, m1 ; out0 -out7 + psubsw m1, m4, m5 ; t6 t7 + paddsw m4, m5 ; -out1 out6 + vpbroadcastd m5, [o(pw_2896x8)] + punpckhqdq m3, m2, m1 ; t3 t7 + punpcklqdq m2, m1 ; t2 t6 + paddsw m1, m2, m3 ; t2+t3 t6+t7 + psubsw m2, m3 ; t2-t3 t6-t7 + punpckhqdq m3, m4, m0 ; out6 -out7 + punpcklqdq m0, m4 ; out0 -out1 + pmulhrsw m2, m5 ; out4 -out5 + pshufd m1, m1, q1032 + pmulhrsw m1, m5 ; out2 -out3 +%endif +%endmacro + +INIT_YMM avx512icl +INV_TXFM_4X8_FN dct, dct +INV_TXFM_4X8_FN dct, identity +INV_TXFM_4X8_FN dct, adst +INV_TXFM_4X8_FN dct, flipadst + +cglobal idct_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m2, [o(pw_2896x8)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + IDCT4_1D_PACKED + vbroadcasti32x4 m2, [o(deint_shuf)] + shufps m3, m0, m1, q1331 + shufps m0, m0, m1, q0220 + pshufb m0, m2 + pshufb m1, m3, m2 + jmp tx2q +.pass2: + vextracti32x4 xm2, m0, 1 + vextracti32x4 xm3, m1, 1 + call .main + vpbroadcastd m4, [o(pw_2048)] + vinserti32x4 m0, m0, xm2, 1 + vinserti32x4 m1, m1, xm3, 1 + pshufd m1, m1, q1032 + jmp m(iadst_4x8_internal_8bpc).end2 +ALIGN function_align +.main: + WRAP_XMM IDCT8_1D_PACKED + ret + +INV_TXFM_4X8_FN adst, dct +INV_TXFM_4X8_FN adst, adst +INV_TXFM_4X8_FN adst, flipadst +INV_TXFM_4X8_FN adst, identity + +cglobal iadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m2, [o(pw_2896x8)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + call m(iadst_8x4_internal_8bpc).main + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + vextracti32x4 xm2, m0, 1 + vextracti32x4 xm3, m1, 1 + pshufd xm4, xm0, q1032 + pshufd xm5, xm1, q1032 + call .main_pass2 + vpbroadcastd m4, [o(pw_2048)] + vinserti32x4 m0, xm2, 1 + vinserti32x4 m1, xm3, 1 + pxor m5, m5 + psubw m5, m4 +.end: + punpcklqdq m4, m5 +.end2: + pmulhrsw m0, m4 + pmulhrsw m1, m4 +.end3: + vpbroadcastd m3, strided + pmulld m5, m3, [o(pd_0to15)] + kxnorb k1, k1, k1 + kmovb k2, k1 + vpgatherdd m3{k1}, [dstq+m5] + pxor m4, m4 + mova [cq], zmm20 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + vpscatterdd [dstq+m5]{k2}, m0 + RET +ALIGN function_align +.main_pass1: + punpckhwd xm0, xm4, xm3 ; 0 7 + punpckhwd xm1, xm5, xm2 ; 2 5 + punpcklwd xm2, xm5 ; 4 3 + punpcklwd xm3, xm4 ; 6 1 + WRAP_XMM IADST8_1D_PACKED 1 + punpcklqdq xm3, xm4, xm0 ; out6 -out7 + punpckhqdq xm0, xm4 ; out0 -out1 + ret +ALIGN function_align +.main_pass2: + WRAP_XMM IADST8_1D_PACKED 2 + ret + +INV_TXFM_4X8_FN flipadst, dct +INV_TXFM_4X8_FN flipadst, adst +INV_TXFM_4X8_FN flipadst, flipadst +INV_TXFM_4X8_FN flipadst, identity + +cglobal iflipadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m2, [o(pw_2896x8)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + call m(iadst_8x4_internal_8bpc).main + punpcklwd m3, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m3 + punpckhwd m1, m3 + jmp tx2q +.pass2: + vextracti32x4 xm2, m0, 1 + vextracti32x4 xm3, m1, 1 + pshufd xm4, xm0, q1032 + pshufd xm5, xm1, q1032 + call m(iadst_4x8_internal_8bpc).main_pass2 + vpbroadcastd m5, [o(pw_2048)] + vinserti32x4 m3, xm1, 1 + vinserti32x4 m2, xm0, 1 + pxor m4, m4 + psubw m4, m5 + pshufd m0, m3, q1032 + pshufd m1, m2, q1032 + jmp 
m(iadst_4x8_internal_8bpc).end + +INIT_ZMM avx512icl +INV_TXFM_4X8_FN identity, dct +INV_TXFM_4X8_FN identity, adst +INV_TXFM_4X8_FN identity, flipadst +INV_TXFM_4X8_FN identity, identity + +cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpbroadcastd m0, [o(pw_2896x8)] + pmulhrsw m0, [cq] + mova m1, [o(int8_permB)] + vpbroadcastd m2, [o(pw_1697x8)] + vpermb m0, m1, m0 + pmulhrsw m2, m0 + paddsw m0, m2 + vextracti32x8 ym1, m0, 1 + jmp tx2q +.pass2: + vpbroadcastd ym4, [o(pw_4096)] + jmp m(iadst_4x8_internal_8bpc).end2 + +%macro INV_TXFM_4X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x16 +%ifidn %1_%2, dct_dct + movsx r6d, word [cq] + mov [cq], eobd + imul r6d, 181 + add r6d, 128+256 + sar r6d, 8+1 + imul r6d, 181 + add r6d, 128+2048 + sar r6d, 8+4 + vpbroadcastw m0, r6d + mova m1, m0 + jmp m(iadst_4x16_internal_8bpc).end3 +%endif +%endmacro + +%macro IDCT16_1D_PACKED 0 + punpckhwd m8, m7, m0 ; dct16 in15 in1 + punpcklwd m9, m4, m0 ; dct4 in2 in0 + punpckhwd m0, m3, m4 ; dct16 in7 in9 + punpcklwd m7, m1 ; dct8 in7 in1 + punpckhwd m1, m6 ; dct16 in3 in13 + punpcklwd m3, m5 ; dct8 in3 in5 + punpckhwd m5, m2 ; dct16 in11 in5 + punpcklwd m6, m2 ; dct4 in3 in1 +cglobal_label .main2 + vpbroadcastd m10, [o(pd_2048)] +.main3: + vpbroadcastq m13, [o(int_mshift)] + vpcmpub k7, m13, m10, 6 ; 0x33... + ITX_MUL2X_PACK 8, 2, 4, 10, 401, 4076, 5 ; t8a t15a + ITX_MUL2X_PACK 0, 2, 4, 10, 3166, 2598, 5 ; t9a t14a + ITX_MUL2X_PACK 5, 2, 4, 10, 1931, 3612, 5 ; t10a t13a + ITX_MUL2X_PACK 1, 2, 4, 10, 3920, 1189, 5 ; t11a t12a + ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 5 ; t4a t7a + ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 5 ; t5a t6a +.main4: + psubsw m2, m8, m0 ; t9 t14 + paddsw m8, m0 ; t8 t15 + psubsw m4, m1, m5 ; t10 t13 + paddsw m1, m5 ; t11 t12 + ITX_MUL2X_PACK 6, 0, 5, 10, 1567, 3784 ; t3 t2 + psubsw m0, m8, m1 ; t11a t12a + paddsw m8, m1 ; t8a t15a + psubsw m1, m7, m3 ; t5a t6a + paddsw m7, m3 ; t4 t7 +.main5: + ITX_MUL2X_PACK 2, 3, 5, 10, 1567, 3784, 5 ; t9a t14a + ITX_MUL2X_PACK 4, 3, 5, 10, m3784, 1567, 5 ; t10a t13a +%if mmsize > 16 + vbroadcasti32x4 m5, [o(deint_shuf)] +%else + mova m5, [o(deint_shuf)] +%endif + vpbroadcastd m11, [o(pw_m2896_2896)] + vpbroadcastd m12, [o(pw_2896_2896)] + paddsw m3, m2, m4 ; t9 t14 + psubsw m2, m4 ; t10 t13 + pshufb m8, m5 + pshufb m7, m5 + pshufb m3, m5 + ITX_MUL2X_PACK 9, 4, 5, 10, 11, 12 ; t0 t1 + ITX_MUL2X_PACK 1, 4, 5, 10, 12, 11 ; t5 t6 + ITX_MUL2X_PACK 0, 4, 5, 10, 11, 12, 8 ; t11 t12 + ITX_MUL2X_PACK 2, 0, 11, 10, 11, 12, 8 ; t10a t13a + punpckhqdq m2, m7, m1 ; t7 t6 + punpcklqdq m7, m1 ; t4 t5 + psubsw m1, m9, m6 ; dct4 out3 out2 + paddsw m9, m6 ; dct4 out0 out1 + packssdw m5, m11 ; t12 t13a + packssdw m4, m0 ; t11 t10a + punpckhqdq m0, m8, m3 ; t15a t14 + punpcklqdq m8, m3 ; t8a t9 + psubsw m3, m9, m2 ; dct8 out7 out6 + paddsw m9, m2 ; dct8 out0 out1 + psubsw m2, m1, m7 ; dct8 out4 out5 + paddsw m1, m7 ; dct8 out3 out2 + psubsw m7, m9, m0 ; out15 out14 + paddsw m0, m9 ; out0 out1 + psubsw m6, m1, m5 ; out12 out13 + paddsw m1, m5 ; out3 out2 + psubsw m5, m2, m4 ; out11 out10 + paddsw m2, m4 ; out4 out5 + psubsw m4, m3, m8 ; out8 out9 + paddsw m3, m8 ; out7 out6 +%endmacro + +INV_TXFM_4X16_FN dct, dct +INV_TXFM_4X16_FN dct, identity +INV_TXFM_4X16_FN dct, adst +INV_TXFM_4X16_FN dct, flipadst + +cglobal idct_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova ym1, [cq+32*2] + vinserti32x8 m1, [cq+32*0], 1 + mova m0, [o(int16_perm)] + mova ym2, [cq+32*3] + vinserti32x8 m2, [cq+32*1], 1 + vpbroadcastd m4, [o(pd_2048)] + vpermb 
m1, m0, m1 ; c0 a0 c1 a1 c2 a2 c3 a3 + vpermb m2, m0, m2 ; d0 b0 d1 b1 d2 b2 d3 b3 + ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896, 2 + ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784, 2 + vpbroadcastd m4, [o(pw_16384)] + psubsw m3, m1, m2 + paddsw m1, m2 ; out0 out1 + vprord m3, 16 ; out2 out3 + punpckldq m0, m1, m3 + punpckhdq m1, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + jmp tx2q +.pass2: + vextracti32x4 xm2, ym0, 1 + vextracti32x4 xm3, ym1, 1 + vextracti32x4 xm4, m0, 2 + vextracti32x4 xm5, m1, 2 + vextracti32x4 xm6, m0, 3 + vextracti32x4 xm7, m1, 3 + call .main + vinserti32x4 ym0, xm2, 1 + vinserti32x4 ym1, xm3, 1 + vinserti32x4 ym4, xm6, 1 + vinserti32x4 ym5, xm7, 1 + vinserti32x8 m0, ym4, 1 + vinserti32x8 m1, ym5, 1 + vpbroadcastd m5, [o(pw_2048)] + pshufd m1, m1, q1032 + jmp m(iadst_4x16_internal_8bpc).end2 +ALIGN function_align +.main: + WRAP_XMM IDCT16_1D_PACKED + ret + +INV_TXFM_4X16_FN adst, dct +INV_TXFM_4X16_FN adst, adst +INV_TXFM_4X16_FN adst, flipadst +INV_TXFM_4X16_FN adst, identity + +cglobal iadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m1, [o(permB)] + vpermq m0, m1, [cq+64*0] + vpermq m1, m1, [cq+64*1] + call m(iadst_16x4_internal_8bpc).main + vpbroadcastd m3, [o(pw_16384)] + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pmulhrsw m2, m3 + pmulhrsw m0, m3 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + jmp tx2q +.pass2: + call .main + vpbroadcastd m5, [o(pw_2048)] + psrlq m10, 4 + psubw m6, m8, m5 +.end: + vpbroadcastd m7, [o(pw_2896x8)] + paddsw ym1, ym2, ym4 + psubsw ym2, ym4 + vinserti32x8 m1, ym2, 1 + pmulhrsw m1, m7 ; -out7 out4 out6 -out5 out8 -out11 -out9 out10 + psrlq m0, m10, 4 + vpermi2q m0, m1, m3 ; 0 1 4 5 8 9 c d + vpermt2q m1, m10, m3 ; 2 3 6 7 a b e f + punpcklqdq m5, m6 +.end2: + pmulhrsw m0, m5 + pmulhrsw m1, m5 +.end3: + vpbroadcastd m3, strided + pmulld m5, m3, [o(pd_0to15)] + kxnorw k1, k1, k1 + kmovw k2, k1 + vpgatherdd m3{k1}, [dstq+m5] + pxor m4, m4 + mova [cq+64*0], m4 + mova [cq+64*1], m4 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + vpscatterdd [dstq+m5]{k2}, m0 + RET +ALIGN function_align +.main: + movu m3, [o(permB+1)] + psrlq m10, m3, 4 +.main2: + vpermi2q m3, m0, m1 ; in15 in12 in13 in14 in11 in8 in9 in10 + vpermt2q m0, m10, m1 ; in0 in3 in2 in1 in4 in7 in6 in5 + vpbroadcastd m9, [o(pd_2048)] + vpbroadcastq ym13, [o(int_mshift)] + kxnorb k1, k1, k1 + punpckhwd m4, m3, m0 ; in12 in3 in14 in1 + punpcklwd m0, m3 ; in0 in15 in2 in13 + kshiftrb k1, k1, 4 + vextracti32x8 ym3, m4, 1 ; in8 in7 in10 in5 + vextracti32x8 ym1, m0, 1 ; in4 in11 in6 in9 +INIT_YMM avx512icl + vpcmpub k7, m13, m9, 6 ; 0x33... 
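+    ; m13 (int_mshift) and k7 set up the vpmultishiftqb interleave path
+    ; of ITX_MUL2X_PACK (flag 4), which favors throughput over latency
+    ; and is therefore reserved for the larger transforms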
+ pxor m8, m8 + ITX_MUL4X_PACK 0, 2, 5, 6, 7, 9, 201, 4091, 995, 3973, 5 + ITX_MUL4X_PACK 1, 2, 5, 6, 7, 9, 1751, 3703, 2440, 3290, 5 + ITX_MUL4X_PACK 3, 2, 5, 6, 7, 9, 3035, 2751, 3513, 2106, 5 + ITX_MUL4X_PACK 4, 2, 5, 6, 7, 9, 3857, 1380, 4052, 601, 5 + psubsw m2, m0, m3 ; t9a t8a t11a t10a + paddsw m0, m3 ; t1a t0a t3a t2a + psubsw m3, m1, m4 ; t13a t12a t15a t14a + paddsw m4, m1 ; t5a t4a t7a t6a + ITX_MUL4X_PACK 2, 1, 5, 6, 7, 9, 799, 4017, 3406, 2276, 5 + psubw m7, m8, m7 + ITX_MUL2X_PACK 3, 1, 5, 9, 7, 6, 4 + vpbroadcastd m6, [o(pw_3784_m1567)] + vpbroadcastd m6{k1}, [o(pw_m3784_1567)] + psubsw m1, m0, m4 ; t5 t4 t7 t6 + paddsw m0, m4 ; t1 t0 t3 t2 + psubsw m4, m2, m3 ; t13a t12a t15a t14a + paddsw m2, m3 ; t9a t8a t11a t10a + ITX_MUL2X_PACK 1, 3, 5, 9, 1567_3784, 6, 16 ; t4a t5a t7a t6a + ITX_MUL2X_PACK 4, 3, 5, 9, 1567_3784, 6, 16 ; t12 t13 t15 t14 + vbroadcasti32x4 m5, [o(deint_shuf)] + pshufb m0, m5 + pshufb m2, m5 + vshufi32x4 m3, m0, m2, 0x03 ; t3 t2 t11a t10a + vinserti32x4 m0, xm2, 1 ; t1 t0 t9a t8a + vshufi32x4 m2, m1, m4, 0x03 ; t7a t6a t15 t14 + vinserti32x4 m1, xm4, 1 ; t4a t5a t12 t13 + pshufd m2, m2, q1032 ; t6a t7a t14 t15 + psubsw m4, m0, m3 ; t3a t2a t11 t10 + paddsw m0, m3 ; -out15 out0 out14 -out1 + paddsw m3, m1, m2 ; out12 -out3 -out13 out2 + psubsw m1, m2 ; t7 t6 t15a t14a + punpckhqdq m2, m4, m1 ; t2a t6 t10 t14a + punpcklqdq m4, m1 ; t3a t7 t11 t15a +INIT_ZMM avx512icl + vinserti32x8 m3, ym0, 1 ; out12 -out3 -out13 out2 -out15 out0 out14 -out1 + ret + +INV_TXFM_4X16_FN flipadst, dct +INV_TXFM_4X16_FN flipadst, adst +INV_TXFM_4X16_FN flipadst, flipadst +INV_TXFM_4X16_FN flipadst, identity + +cglobal iflipadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m1, [o(permB)] + vpermq m0, m1, [cq+64*0] + vpermq m1, m1, [cq+64*1] + call m(iadst_16x4_internal_8bpc).main + vpbroadcastd m3, [o(pw_16384)] + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + pmulhrsw m2, m3 + pmulhrsw m1, m3 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + jmp tx2q +.pass2: + call m(iadst_4x16_internal_8bpc).main + vpbroadcastd m6, [o(pw_2048)] + psrlq m10, 12 + psubw m5, m8, m6 + jmp m(iadst_4x16_internal_8bpc).end + +INV_TXFM_4X16_FN identity, dct +INV_TXFM_4X16_FN identity, adst +INV_TXFM_4X16_FN identity, flipadst +INV_TXFM_4X16_FN identity, identity + +cglobal iidentity_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m2, [o(int16_perm)] + vpermb m1, m2, [cq+64*0] + vpermb m2, m2, [cq+64*1] + vpbroadcastd m4, [o(pw_1697x8)] + vpbroadcastd m0, [o(pd_m1)] + pmulhrsw m3, m4, m1 ; we want to do a signed avg, but pavgw is + vpcmpw k1, m1, m0, 4 ; unsigned. as long as both signs are equal + pmulhrsw m4, m2 ; it still works, but if the input is -1 the + vpcmpw k2, m2, m0, 4 ; pmulhrsw result will become 0 which causes + vpavgw m1{k1}{z}, m3 ; pavgw to output -32768 instead of 0 unless + vpavgw m2{k2}{z}, m4 ; we explicitly deal with that case here. 
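+    ; worked example (sketch): pmulhrsw(a, b) = (2*a*b + 0x8000) >> 16,
+    ; so for in = -1, pmulhrsw(-1, 1697*8) = 0; unsigned pavgw(0xffff, 0)
+    ; would give 0x8000 = -32768, whereas the zeroing masks produce 0,
+    ; the correct signed average of -1 and 0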
+ punpckldq m0, m1, m2 + punpckhdq m1, m2 + jmp tx2q +.pass2: + vpbroadcastd m3, [o(pw_1697x16)] + vpbroadcastd m5, [o(pw_2048)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m0 + paddsw m1, m1 + paddsw m0, m2 + paddsw m1, m3 + jmp m(iadst_4x16_internal_8bpc).end2 + +%macro WRITE_8X4 4-7 strideq*1, strideq*2, r6 ; coefs[1-2], tmp[1-2], off[1-3] + movq xm%3, [dstq ] + movhps xm%3, [dstq+%5] + movq xm%4, [dstq+%6] + movhps xm%4, [dstq+%7] + pmovzxbw m%3, xm%3 + pmovzxbw m%4, xm%4 +%ifnum %1 + paddw m%3, m%1 +%else + paddw m%3, %1 +%endif +%ifnum %2 + paddw m%4, m%2 +%else + paddw m%4, %2 +%endif + packuswb m%3, m%4 + vextracti32x4 xm%4, m%3, 1 + movq [dstq ], xm%3 + movhps [dstq+%6], xm%3 + movq [dstq+%5], xm%4 + movhps [dstq+%7], xm%4 +%endmacro + +%macro INV_TXFM_8X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x4 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_2048)] + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + mova m1, m0 + jmp m(iadst_8x4_internal_8bpc).end3 +%endif +%endmacro + +INIT_YMM avx512icl +INV_TXFM_8X4_FN dct, dct +INV_TXFM_8X4_FN dct, adst +INV_TXFM_8X4_FN dct, flipadst +INV_TXFM_8X4_FN dct, identity + +cglobal idct_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpbroadcastd xm3, [o(pw_2896x8)] + pmulhrsw xm0, xm3, [cq+16*0] + pmulhrsw xm1, xm3, [cq+16*1] + pmulhrsw xm2, xm3, [cq+16*2] + pmulhrsw xm3, [cq+16*3] + call m(idct_4x8_internal_8bpc).main + vbroadcasti32x4 m4, [o(deint_shuf)] + vinserti32x4 m3, m1, xm3, 1 + vinserti32x4 m1, m0, xm2, 1 + shufps m0, m1, m3, q0220 + shufps m1, m3, q1331 + pshufb m0, m4 + pshufb m1, m4 + jmp tx2q +.pass2: + IDCT4_1D_PACKED + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + jmp m(iadst_8x4_internal_8bpc).end2 + +INV_TXFM_8X4_FN adst, dct +INV_TXFM_8X4_FN adst, adst +INV_TXFM_8X4_FN adst, flipadst +INV_TXFM_8X4_FN adst, identity + +cglobal iadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpbroadcastd xm0, [o(pw_2896x8)] + pshufd xm4, [cq+16*0], q1032 + pmulhrsw xm3, xm0, [cq+16*3] + pshufd xm5, [cq+16*1], q1032 + pmulhrsw xm2, xm0, [cq+16*2] + pmulhrsw xm4, xm0 + pmulhrsw xm5, xm0 + call m(iadst_4x8_internal_8bpc).main_pass1 + vinserti32x4 m0, xm2, 1 + vinserti32x4 m1, xm3, 1 + pxor m3, m3 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + psubsw m3, m2 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + call .main +.end: + vpermq m0, m0, q3120 + vpermq m1, m1, q3120 +.end2: + vpbroadcastd m2, [o(pw_2048)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 +.end3: + pxor m2, m2 + mova [cq], zmm18 + lea r6, [strideq*3] + WRITE_8X4 0, 1, 4, 5 + RET +ALIGN function_align +.main: + IADST4_1D_PACKED + ret + +INV_TXFM_8X4_FN flipadst, dct +INV_TXFM_8X4_FN flipadst, adst +INV_TXFM_8X4_FN flipadst, flipadst +INV_TXFM_8X4_FN flipadst, identity + +cglobal iflipadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpbroadcastd xm0, [o(pw_2896x8)] + pshufd xm4, [cq+16*0], q1032 + pmulhrsw xm3, xm0, [cq+16*3] + pshufd xm5, [cq+16*1], q1032 + pmulhrsw xm2, xm0, [cq+16*2] + pmulhrsw xm4, xm0 + pmulhrsw xm5, xm0 + call m(iadst_4x8_internal_8bpc).main_pass1 + vinserti32x4 m3, m3, xm1, 1 + vinserti32x4 m2, m2, xm0, 1 + punpckhwd m1, m3, m2 + punpcklwd m3, m2 + pxor m0, m0 + psubsw m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + call m(iadst_8x4_internal_8bpc).main + mova m2, m1 + vpermq m1, m0, q2031 + vpermq m0, m2, q2031 + jmp m(iadst_8x4_internal_8bpc).end2 + +INV_TXFM_8X4_FN identity, dct +INV_TXFM_8X4_FN identity, adst 
+INV_TXFM_8X4_FN identity, flipadst +INV_TXFM_8X4_FN identity, identity + +cglobal iidentity_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova xm2, [cq+16*0] + mova xm0, [cq+16*1] + vinserti32x4 m2, [cq+16*2], 1 + vinserti32x4 m0, [cq+16*3], 1 + vpbroadcastd m3, [o(pw_2896x8)] + punpcklwd m1, m2, m0 + punpckhwd m2, m0 + pmulhrsw m1, m3 + pmulhrsw m2, m3 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + paddsw m0, m0 + paddsw m1, m1 + jmp tx2q +.pass2: + vpbroadcastd m3, [o(pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + jmp m(iadst_8x4_internal_8bpc).end + +%macro INV_TXFM_8X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x8 +%ifidn %1_%2, dct_dct +INIT_ZMM avx512icl + movsx r6d, word [cq] + mov [cq], eobd +.dconly: + imul r6d, 181 + add r6d, 128+256 + sar r6d, 8+1 +.dconly2: + vpbroadcastd ym2, strided + imul r6d, 181 + pmulld ym5, ym2, [o(pd_0to15)] + kxnorb k1, k1, k1 + add r6d, 128+2048 + sar r6d, 8+4 + pxor m3, m3 + vpbroadcastw m4, r6d +.dconly_loop: + kmovb k2, k1 + vpgatherdq m2{k1}, [dstq+ym5] + punpcklbw m0, m2, m3 + punpckhbw m1, m2, m3 + paddw m0, m4 + paddw m1, m4 + packuswb m0, m1 + kmovb k1, k2 + vpscatterdq [dstq+ym5]{k2}, m0 + lea dstq, [dstq+strideq*8] + sub r3d, 8 + jg .dconly_loop + RET +INIT_YMM avx512icl +%endif +%endmacro + +INV_TXFM_8X8_FN dct, dct +INV_TXFM_8X8_FN dct, identity +INV_TXFM_8X8_FN dct, adst +INV_TXFM_8X8_FN dct, flipadst + +cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 ; 0 1 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpermq m2, [cq+32*2], q3120 ; 4 5 + vpermq m1, [cq+32*1], q3120 ; 2 3 + call .main + shufps m4, m0, m1, q0220 + shufps m5, m0, m1, q1331 + shufps m1, m2, m3, q0220 + shufps m3, m2, m3, q1331 + vbroadcasti32x4 m0, [o(deint_shuf)] + vpbroadcastd m2, [o(pw_16384)] + REPX {pshufb x, m0}, m4, m5, m1, m3 + REPX {pmulhrsw x, m2}, m4, m5, m1, m3 + vinserti32x4 m0, m4, xm1, 1 + vshufi32x4 m2, m4, m1, 0x03 + vinserti32x4 m1, m5, xm3, 1 + vshufi32x4 m3, m5, m3, 0x03 + jmp tx2q +.pass2: + call .main + vpbroadcastd m4, [o(pw_2048)] + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + jmp m(iadst_8x8_internal_8bpc).end2 +ALIGN function_align +cglobal_label .main + IDCT8_1D_PACKED + ret + +INV_TXFM_8X8_FN adst, dct +INV_TXFM_8X8_FN adst, adst +INV_TXFM_8X8_FN adst, flipadst +INV_TXFM_8X8_FN adst, identity + +cglobal iadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpermq m4, [cq+32*0], q1302 ; 1 0 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpermq m5, [cq+32*1], q1302 ; 3 2 + vpermq m2, [cq+32*2], q3120 ; 4 5 + call .main_pass1 + vpbroadcastd m5, [o(pw_16384_m16384)] + punpcklwd m4, m0, m1 + punpckhwd m0, m1 + punpcklwd m1, m2, m3 + punpckhwd m2, m3 + punpcklwd m3, m4, m0 + punpckhwd m4, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + REPX {pmulhrsw x, m5}, m3, m4, m0, m1 + vshufi32x4 m2, m3, m0, 0x03 + vinserti32x4 m0, m3, xm0, 1 + vshufi32x4 m3, m4, m1, 0x03 + vinserti32x4 m1, m4, xm1, 1 + jmp tx2q +.pass2: + pshufd m4, m0, q1032 + pshufd m5, m1, q1032 + call .main_pass2 + vpbroadcastd m5, [o(pw_2048)] + vpbroadcastd xm4, [o(pw_4096)] + psubw m4, m5 ; lower half = 2048, upper half = -2048 +.end: + REPX {vpermq x, x, q3120}, m0, m1, m2, m3 +.end2: + pmulhrsw m0, m4 + pmulhrsw m1, m4 +.end3: + pmulhrsw m2, m4 + pmulhrsw m3, m4 +.end4: + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + mova [cq+32*2], m4 + mova [cq+32*3], m4 + lea r6, [strideq*3] + WRITE_8X4 0, 1, 4, 5 + lea dstq, [dstq+strideq*4] + WRITE_8X4 2, 3, 4, 5 
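+    ; both 8x4 halves of the residual have been added to dst at this
+    ; point, and the coefficient rows were zeroed above before returning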
+ RET +ALIGN function_align +.main_pass1: + punpckhwd m0, m4, m3 ; 0 7 + punpckhwd m1, m5, m2 ; 2 5 + punpcklwd m2, m5 ; 4 3 + punpcklwd m3, m4 ; 6 1 + IADST8_1D_PACKED 1 + punpcklqdq m3, m4, m0 ; out6 -out7 + punpckhqdq m0, m4 ; out0 -out1 + ret +ALIGN function_align +cglobal_label .main_pass2 + IADST8_1D_PACKED 2 + ret + +INV_TXFM_8X8_FN flipadst, dct +INV_TXFM_8X8_FN flipadst, adst +INV_TXFM_8X8_FN flipadst, flipadst +INV_TXFM_8X8_FN flipadst, identity + +cglobal iflipadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpermq m4, [cq+32*0], q1302 ; 1 0 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpermq m5, [cq+32*1], q1302 ; 3 2 + vpermq m2, [cq+32*2], q3120 ; 4 5 + call m(iadst_8x8_internal_8bpc).main_pass1 + vpbroadcastd m5, [o(pw_m16384_16384)] + punpckhwd m4, m3, m2 + punpcklwd m3, m2 + punpckhwd m2, m1, m0 + punpcklwd m1, m0 + punpckhwd m0, m4, m3 + punpcklwd m4, m3 + punpckhwd m3, m2, m1 + punpcklwd m2, m1 + REPX {pmulhrsw x, m5}, m0, m4, m3, m2 + vinserti32x4 m1, m0, xm3, 1 + vshufi32x4 m3, m0, m3, 0x03 + vinserti32x4 m0, m4, xm2, 1 + vshufi32x4 m2, m4, m2, 0x03 + jmp tx2q +.pass2: + pshufd m4, m0, q1032 + pshufd m5, m1, q1032 + call m(iadst_8x8_internal_8bpc).main_pass2 + vpbroadcastd m4, [o(pw_2048)] + vpbroadcastd xm5, [o(pw_4096)] + psubw m4, m5 ; lower half = -2048, upper half = 2048 + vpermq m5, m3, q2031 + vpermq m3, m0, q2031 + vpermq m0, m2, q2031 + vpermq m2, m1, q2031 + pmulhrsw m1, m0, m4 + pmulhrsw m0, m5, m4 + jmp m(iadst_8x8_internal_8bpc).end3 + +INV_TXFM_8X8_FN identity, dct +INV_TXFM_8X8_FN identity, adst +INV_TXFM_8X8_FN identity, flipadst +INV_TXFM_8X8_FN identity, identity + +cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova xm3, [cq+16*0] + mova xm2, [cq+16*1] + vinserti32x4 m3, [cq+16*4], 1 + vinserti32x4 m2, [cq+16*5], 1 + mova xm4, [cq+16*2] + mova xm0, [cq+16*3] + vinserti32x4 m4, [cq+16*6], 1 + vinserti32x4 m0, [cq+16*7], 1 + punpcklwd m1, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m0 + punpckhwd m4, m0 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + vpbroadcastd m4, [o(pw_4096)] + jmp m(iadst_8x8_internal_8bpc).end + +%macro INV_TXFM_8X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x16 +%ifidn %1_%2, dct_dct + movsx r6d, word [cq] + mov [cq], eobd + or r3d, 16 + imul r6d, 181 + add r6d, 128 + sar r6d, 8 + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly +%endif +%endmacro + +%macro ITX_8X16_LOAD_COEFS 0 + vpbroadcastd m4, [o(pw_2896x8)] + pmulhrsw m0, m4, [cq+32*0] + add cq, 32*4 + pmulhrsw m7, m4, [cq+32*3] + pmulhrsw m1, m4, [cq-32*3] + pmulhrsw m6, m4, [cq+32*2] + pmulhrsw m2, m4, [cq-32*2] + pmulhrsw m5, m4, [cq+32*1] + pmulhrsw m3, m4, [cq-32*1] + pmulhrsw m4, [cq+32*0] +%endmacro + +INIT_ZMM avx512icl +INV_TXFM_8X16_FN dct, dct +INV_TXFM_8X16_FN dct, identity +INV_TXFM_8X16_FN dct, adst +INV_TXFM_8X16_FN dct, flipadst + +cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m3, [o(permB)] + vpermq m0, m3, [cq+64*0] + vpbroadcastd m4, [o(pw_2896x8)] + vpermq m1, m3, [cq+64*1] + vpermq m2, m3, [cq+64*2] + vpermq m3, m3, [cq+64*3] + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + call m(idct_16x8_internal_8bpc).main + vpbroadcastd m5, [o(pw_16384)] + punpckhwd m4, m0, m2 ; b0 f0 b1 f1 b2 f2 b3 f3 + punpcklwd m0, m2 ; a0 e0 a1 e1 a2 e2 a3 e3 + punpckhwd m2, m1, m3 ; c0 g0 c1 g1 c2 g2 c3 g3 + punpcklwd m1, m3 ; d0 h0 d1 h1 d2 h2 d3 h3 + REPX {pmulhrsw x, m5}, m4, m0, m2, m1 + punpckhwd m3, m0, m4 ; a2 b2 e2 f2 a3 b3 e3 f3 + punpcklwd m0, m4 ; a0 b0 e0 f0 
a1 b1 e1 f1 + punpckhwd m4, m2, m1 ; c2 d2 g2 h2 c3 d3 g3 h3 + punpcklwd m2, m1 ; c0 d0 g0 h0 c1 d1 g1 h1 + punpckhdq m1, m0, m2 ; 1 5 9 13 + punpckldq m0, m2 ; 0 4 8 12 + punpckldq m2, m3, m4 ; 2 6 10 14 + punpckhdq m3, m4 ; 3 7 11 15 + jmp tx2q +.pass2: + vprord m5, [o(int16_perm)], 16 + vshufi32x4 m2, m2, q1320 ; 2 10 14 6 + vshufi32x4 m4, m1, m3, q2310 ; 1 5 15 11 + vshufi32x4 m1, m3, q0132 ; 9 13 7 3 + vpermb m9, m5, m0 + vpermb m7, m5, m2 + vpermb m8, m5, m4 + vpermb m0, m5, m1 + vextracti32x8 ym6, m9, 1 + vextracti32x8 ym3, m7, 1 + vextracti32x8 ym5, m8, 1 + vextracti32x8 ym1, m0, 1 + call .main2 + mova ym8, [o(gather8a)] + lea r3, [dstq+strideq*4] + pmovzxdq m9, ym8 + pshufd ym8, ym8, q1230 + vpermt2q m0, m9, m4 + vpermt2q m1, m9, m5 + vpermt2q m2, m9, m6 + vpermt2q m3, m9, m7 +.end: + vpbroadcastd m7, [o(pw_2048)] +.end2: + pmulhrsw m0, m7 + pmulhrsw m1, m7 +.end3: + pmulhrsw m2, m7 + pmulhrsw m3, m7 +.end4: + vpbroadcastd ym6, strided + kxnorb k1, k1, k1 + pxor m4, m4 + pmulld ym8, ym6 + kmovb k2, k1 + vpgatherdq m6{k1}, [dstq+ym8] + kmovb k1, k2 + vpgatherdq m7{k2}, [r3+ym8] + mova [cq+64*0], m4 + mova [cq+64*1], m4 + kmovb k2, k1 + mova [cq+64*2], m4 + mova [cq+64*3], m4 + punpcklbw m5, m6, m4 + punpckhbw m6, m4 + paddw m0, m5 + paddw m1, m6 + packuswb m0, m1 + vpscatterdq [dstq+ym8]{k1}, m0 + punpcklbw m6, m7, m4 + punpckhbw m7, m4 + paddw m2, m6 + paddw m3, m7 + packuswb m2, m3 + vpscatterdq [r3+ym8]{k2}, m2 + RET +ALIGN function_align +cglobal_label .main_fast2 ; bottom three-quarters are zero + vpbroadcastd ym10, [o(pd_2048)] + vpbroadcastq ym13, [o(int_mshift)] + vpbroadcastd ym3, [o(pw_401_4076x8)] + vpbroadcastd ym5, [o(pw_799_4017x8)] + vpbroadcastd ym4, [o(pw_m1189_3920x8)] + pxor ym6, ym6 + punpckhwd ym2, ym0, ym0 + pmulhrsw ym2, ym3 ; t8a t15a + punpcklwd ym7, ym1, ym1 + pmulhrsw ym7, ym5 ; t4a t7a + punpckhwd ym1, ym1 + pmulhrsw ym4, ym1 ; t11a t12a + vpcmpub k7, ym13, ym10, 6 + punpcklwd ym9, ym6, ym0 + psubsw ym0, ym2, ym4 ; t11a t12a + paddsw ym8, ym2, ym4 ; t8a t15a + mova ym1, ym7 + jmp .main5 +ALIGN function_align +cglobal_label .main_fast ; bottom half is zero + vpbroadcastd ym10, [o(pd_2048)] + vpbroadcastq ym13, [o(int_mshift)] + pxor ym6, ym6 + punpckhwd ym8, ym0, ym0 + punpckhwd ym4, ym3, ym3 + punpckhwd ym5, ym2, ym2 + punpcklwd ym7, ym1, ym1 + punpckhwd ym1, ym1 + punpcklwd ym3, ym3 + punpcklwd ym9, ym6, ym0 + punpcklwd ym6, ym2 + vpbroadcastd ym2, [o(pw_401_4076x8)] + vpbroadcastd ym0, [o(pw_m2598_3166x8)] + vpbroadcastd ym11, [o(pw_1931_3612x8)] + vpbroadcastd ym12, [o(pw_m1189_3920x8)] + pmulhrsw ym8, ym2 ; t8a t15a + vpbroadcastd ym2, [o(pw_799_4017x8)] + pmulhrsw ym0, ym4 ; t9a t14a + vpbroadcastd ym4, [o(pw_m2276_3406x8)] + pmulhrsw ym5, ym11 ; t10a t13a + pmulhrsw ym1, ym12 ; t11a t12a + pmulhrsw ym7, ym2 ; t4a t7a + pmulhrsw ym3, ym4 ; t5a t6a + vpcmpub k7, ym13, ym10, 6 + jmp .main4 +ALIGN function_align +cglobal_label .main + WRAP_YMM IDCT16_1D_PACKED + ret + +INV_TXFM_8X16_FN adst, dct +INV_TXFM_8X16_FN adst, adst +INV_TXFM_8X16_FN adst, flipadst +INV_TXFM_8X16_FN adst, identity + +cglobal iadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + call m(iadst_16x8_internal_8bpc).main_pass1 + vbroadcasti32x4 m6, [o(int_shuf1)] + vpbroadcastd m7, [o(pw_16384_m16384)] + punpckhwd m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3 + punpcklwd m4, m0 ; g0 h0 g1 h1 g2 h2 g3 h3 + pshufb m5, m1, m6 ; c0 d0 c1 d1 c2 d2 c3 d3 + pshufb m2, m6 ; e0 f0 e1 f1 e2 f2 e3 f3 +.pass1_end: + REPX {pmulhrsw x, m7}, m3, m5, m4, m2 + punpckldq m0, m3, m5 ; a0 b0 c0 d0 a1 b1 
c1 d1 + punpckhdq m3, m5 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpckhdq m5, m2, m4 ; e2 f2 g2 h2 e3 f3 g3 h3 + punpckldq m2, m4 ; e0 f0 g0 h0 e1 f1 g1 h1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m5 + punpckhqdq m3, m5 + jmp tx2q +.pass2: + call .main_pass2 + vpbroadcastd m6, [o(pw_2048)] + psrlq m10, 4 + psubw m7, m8, m6 +.pass2_end: + vpbroadcastd m5, [o(pw_2896x8)] + paddsw m1, m2, m4 + psubsw m2, m4 + pmulhrsw m1, m5 ; -out7 out4 out6 -out5 + pmulhrsw m5, m2 ; out8 -out11 -out9 out10 + mova ym8, [o(gather8c)] + lea r3, [dstq+strideq] + psrlq m2, m10, 4 + vpermi2q m2, m0, m3 ; 1 3 13 15 + vpermt2q m0, m10, m3 ; 0 2 12 14 + psrlq m3, m10, 8 + vpermi2q m3, m1, m5 ; 5 7 9 11 + psrlq m10, 12 + vpermt2q m1, m10, m5 ; 4 6 8 10 + pmulhrsw m0, m6 + pmulhrsw m1, m6 + jmp m(idct_8x16_internal_8bpc).end3 +ALIGN function_align +.main_pass1: + vpbroadcastd m2, [o(pw_2896x8)] + pmulhrsw m5, m2, [cq+64*0] + pmulhrsw m3, m2, [cq+64*3] + pmulhrsw m1, m2, [cq+64*1] + pmulhrsw m2, [cq+64*2] + movu m4, [o(permA+3)] + psrlq m10, m4, 4 + mova m6, m4 + vpermi2q m4, m5, m3 ; in0 in12 in2 in14 + vpermt2q m5, m10, m3 ; in15 in3 in13 in1 + vpermi2q m6, m1, m2 ; in4 in8 in6 in10 + vpermt2q m1, m10, m2 ; in11 in7 in9 in5 + jmp .main +ALIGN function_align +.main_pass2: + mova m4, [o(permC)] + psrlq m5, m4, 4 + vpermi2q m4, m0, m2 ; in0 in12 in2 in14 + psrlq m6, m5, 4 + vpermi2q m5, m1, m3 ; in15 in3 in13 in1 + psrlq m10, m6, 4 + vpermi2q m6, m0, m2 ; in4 in8 in6 in10 + vpermt2q m1, m10, m3 ; in11 in7 in9 in5 +.main: + punpcklwd m0, m4, m5 ; in0 in15 in2 in13 + punpckhwd m4, m5 ; in12 in3 in14 in1 + punpcklwd m5, m6, m1 ; in4 in11 in6 in9 + punpckhwd m6, m1 ; in8 in7 in10 in5 +cglobal_label .main2 + vpbroadcastd m9, [o(pd_2048)] + vpbroadcastq m13, [o(int_mshift)] + kxnorb k1, k1, k1 + vpcmpub k7, m13, m9, 6 ; 0x33... 
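+    ; k1 = 0xff from the kxnorb above; ITX_MUL4X_PACK merges a second
+    ; coefficient pair into the low half of each broadcast register, so
+    ; one packed multiply performs two differently-scaled rotations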
+ pxor m8, m8 + ITX_MUL4X_PACK 0, 1, 2, 3, 7, 9, 201, 4091, 995, 3973, 5 + ITX_MUL4X_PACK 6, 1, 2, 3, 7, 9, 3035, 2751, 3513, 2106, 5 + ITX_MUL4X_PACK 4, 1, 2, 3, 7, 9, 3857, 1380, 4052, 601, 5 + ITX_MUL4X_PACK 5, 1, 2, 3, 7, 9, 1751, 3703, 2440, 3290, 5 + psubsw m2, m0, m6 ; t9a t8a t11a t10a + paddsw m0, m6 ; t1a t0a t3a t2a + psubsw m3, m5, m4 ; t13a t12a t15a t14a + paddsw m5, m4 ; t5a t4a t7a t6a + ITX_MUL4X_PACK 2, 4, 1, 6, 7, 9, 799, 4017, 3406, 2276, 5 + psubw m7, m8, m7 + ITX_MUL2X_PACK 3, 4, 1, 9, 7, 6, 4 + vpbroadcastd m6, [o(pw_3784_m1567)] + vpbroadcastd m6{k1}, [o(pw_m3784_1567)] + psubsw m1, m0, m5 ; t5 t4 t7 t6 + paddsw m0, m5 ; t1 t0 t3 t2 + psubsw m4, m2, m3 ; t13a t12a t15a t14a + paddsw m2, m3 ; t9a t8a t11a t10a + ITX_MUL2X_PACK 1, 3, 5, 9, 1567_3784, 6, 16 ; t5a t4a t6a t7a + ITX_MUL2X_PACK 4, 3, 5, 9, 1567_3784, 6, 16 ; t13 t12 t14 t15 + vbroadcasti32x4 m5, [o(deint_shuf)] + pshufb m0, m5 + pshufb m2, m5 + vshufi32x4 m3, m0, m2, q3232 ; t3 t2 t11a t10a + vinserti32x8 m0, ym2, 1 ; t1 t0 t9a t8a + vshufi32x4 m2, m1, m4, q3232 ; t6a t7a t14 t15 + vinserti32x8 m1, ym4, 1 ; t5a t4a t13 t12 + pshufd m2, m2, q1032 ; t7a t6a t15 t14 + psubsw m4, m0, m3 ; t3a t2a t11 t10 + paddsw m0, m3 ; -out15 out0 out14 -out1 + paddsw m3, m1, m2 ; out12 -out3 -out13 out2 + psubsw m1, m2 ; t7 t6 t15a t14a + punpckhqdq m2, m4, m1 ; t2a t6 t10 t14a + punpcklqdq m4, m1 ; t3a t7 t11 t15a + ret + +INV_TXFM_8X16_FN flipadst, dct +INV_TXFM_8X16_FN flipadst, adst +INV_TXFM_8X16_FN flipadst, flipadst +INV_TXFM_8X16_FN flipadst, identity + +cglobal iflipadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + call m(iadst_16x8_internal_8bpc).main_pass1 + vbroadcasti32x4 m6, [o(int_shuf2)] + vpbroadcastd m7, [o(pw_m16384_16384)] + punpcklwd m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3 + punpckhwd m4, m0 ; g0 h0 g1 h1 g2 h2 g3 h3 + pshufb m5, m2, m6 ; c0 d0 c1 d1 c2 d2 c3 d3 + pshufb m2, m1, m6 ; e0 f0 e1 f1 e2 f2 e3 f3 + jmp m(iadst_8x16_internal_8bpc).pass1_end +.pass2: + call m(iadst_8x16_internal_8bpc).main_pass2 + vpbroadcastd m7, [o(pw_2048)] + psrlq m10, 36 + psubw m6, m8, m7 + jmp m(iadst_8x16_internal_8bpc).pass2_end + +INV_TXFM_8X16_FN identity, dct +INV_TXFM_8X16_FN identity, adst +INV_TXFM_8X16_FN identity, flipadst +INV_TXFM_8X16_FN identity, identity + +cglobal iidentity_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m0, [o(int16_perm)] + vpermb m3, m0, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3 + vpermb m2, m0, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3 + vpermb m4, m0, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3 + vpermb m0, m0, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3 + vpbroadcastd m5, [o(pw_2896x8)] + punpckldq m1, m3, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckhdq m3, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpckldq m2, m4, m0 ; e0 f0 g0 h0 a1 f1 g1 h1 + punpckhdq m4, m0 ; e2 f2 g2 h2 e3 f3 g3 h3 + REPX {pmulhrsw x, m5}, m1, m2, m3, m4 + punpcklqdq m0, m1, m2 ; a0 b0 c0 d0 e0 f0 g0 h0 + punpckhqdq m1, m2 ; a1 b1 c1 d1 e1 f1 g1 h1 + punpcklqdq m2, m3, m4 ; a2 b2 c2 d2 e2 f2 g2 h2 + punpckhqdq m3, m4 ; a3 b3 c3 d3 e3 f3 g3 h3 + jmp tx2q +.pass2: + vpbroadcastd m7, [o(pw_1697x16)] + mova ym8, [o(gather8b)] + lea r3, [dstq+strideq*2] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + REPX {paddsw x, x}, m0, m1, m2, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + jmp m(idct_8x16_internal_8bpc).end + +%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2] + pmovzxbw m%3, [dstq+%5] +%ifnum %1 + paddw m%3, m%1 +%else + paddw m%3, %1 +%endif + pmovzxbw m%4, 
[dstq+%6] +%ifnum %2 + paddw m%4, m%2 +%else + paddw m%4, %2 +%endif + packuswb m%3, m%4 + vpermq m%3, m%3, q3120 + mova [dstq+%5], xm%3 + vextracti32x4 [dstq+%6], m%3, 1 +%endmacro + +%macro INV_TXFM_16X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x4 +%ifidn %1_%2, dct_dct + movsx r6d, word [cq] + mov [cq], eobd + jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly2 +%endif +%endmacro + +INIT_ZMM avx512icl +INV_TXFM_16X4_FN dct, dct +INV_TXFM_16X4_FN dct, adst +INV_TXFM_16X4_FN dct, flipadst +INV_TXFM_16X4_FN dct, identity + +cglobal idct_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova xm0, [cq+16*0] + mova xm1, [cq+16*1] + mova xm2, [cq+16*2] + mova xm3, [cq+16*3] + mova xm4, [cq+16*4] + mova xm5, [cq+16*5] + mova xm6, [cq+16*6] + mova xm7, [cq+16*7] + call m(idct_4x16_internal_8bpc).main + vpbroadcastd m8, [o(pw_16384)] + vinserti32x4 ym1, xm3, 1 ; 3 2 7 6 + vinserti32x4 ym5, xm7, 1 ; b a f e + vinserti32x4 ym0, xm2, 1 ; 0 1 4 5 + vinserti32x4 ym4, xm6, 1 ; 8 9 c d + vinserti32x8 m1, ym5, 1 ; 3 2 7 6 b a f e + vinserti32x8 m0, ym4, 1 ; 0 1 4 5 8 9 c d + pmulhrsw m1, m8 + pmulhrsw m0, m8 + pshufd m1, m1, q1032 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + jmp tx2q +.pass2: + IDCT4_1D_PACKED + mova m2, [o(permA)] + jmp m(iadst_16x4_internal_8bpc).end + +INV_TXFM_16X4_FN adst, dct +INV_TXFM_16X4_FN adst, adst +INV_TXFM_16X4_FN adst, flipadst +INV_TXFM_16X4_FN adst, identity + +cglobal iadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m0, [cq+64*0] + mova m1, [cq+64*1] + movshdup m3, [o(permB)] + psrlq m10, m3, 4 + call m(iadst_4x16_internal_8bpc).main2 + vpbroadcastd m6, [o(pw_16384_m16384)] + psrlq m0, m10, 4 + psrlq m10, 8 +.pass1_end: + punpcklwd ym5, ym4, ym2 + punpckhwd ym4, ym2 + vinserti32x8 m5, ym4, 1 + mova m1, m9 + vpdpwssd m1, m5, [o(pw_m2896_2896)] {1to16} + mova m4, m9 + vpdpwssd m4, m5, [o(pw_2896_2896)] {1to16} + psrad m1, 12 + psrad m4, 12 + packssdw m1, m4 ; out8 -out7 -out9 out6 -out11 out4 out10 -out5 + vpermi2q m0, m1, m3 ; 0 1 4 5 8 9 c d + vpermt2q m1, m10, m3 ; 2 3 6 7 a b e f + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + pmulhrsw m0, m6 + pmulhrsw m1, m6 + jmp tx2q +.pass2: + call .main + movu m2, [o(permA+1)] +.end: + vpbroadcastd m3, [o(pw_2048)] + pmulhrsw m0, m3 + pmulhrsw m1, m3 +.end2: + psrlq m3, m2, 4 + vpermi2q m2, m0, m1 + vpermi2q m3, m0, m1 +.end3: + lea r3, [dstq+strideq*2] + mova xm1, [dstq+strideq*0] + vinserti32x4 ym1, [dstq+strideq*1], 1 + vinserti32x4 m1, [r3 +strideq*0], 2 + vinserti32x4 m1, [r3 +strideq*1], 3 + pxor m4, m4 + mova [cq+64*0], m4 + mova [cq+64*1], m4 + punpcklbw m0, m1, m4 + punpckhbw m1, m4 + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [r3 +strideq*0], m0, 2 + vextracti32x4 [r3 +strideq*1], m0, 3 + RET +ALIGN function_align +.main: + IADST4_1D_PACKED + ret + +INV_TXFM_16X4_FN flipadst, dct +INV_TXFM_16X4_FN flipadst, adst +INV_TXFM_16X4_FN flipadst, flipadst +INV_TXFM_16X4_FN flipadst, identity + +cglobal iflipadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m0, [cq+64*0] + mova m1, [cq+64*1] + movshdup m3, [o(permB)] + psrlq m10, m3, 4 + call m(iadst_4x16_internal_8bpc).main2 + vpbroadcastd m6, [o(pw_m16384_16384)] + psrlq m0, m10, 12 + psrlq m10, 16 + jmp m(iadst_16x4_internal_8bpc).pass1_end +.pass2: + call m(iadst_16x4_internal_8bpc).main + movu m2, [o(permA+2)] + jmp m(iadst_16x4_internal_8bpc).end + +INV_TXFM_16X4_FN 
identity, dct +INV_TXFM_16X4_FN identity, adst +INV_TXFM_16X4_FN identity, flipadst +INV_TXFM_16X4_FN identity, identity + +cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m1, [cq+64*0] + mova m2, [cq+64*1] + vpbroadcastd m3, [o(pw_1697x16)] + vpbroadcastd m4, [o(pw_16384)] + mova m5, [o(idtx_16x4p)] + shufps m0, m1, m2, q2020 + shufps m1, m2, q3131 + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + paddsw m0, m2 + paddsw m1, m3 + vpermb m0, m5, m0 + vpermb m1, m5, m1 + jmp tx2q +.pass2: + vpbroadcastd m3, [o(pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + movu m2, [o(permA+1)] + jmp m(iadst_16x4_internal_8bpc).end + +%macro INV_TXFM_16X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x8 +%ifidn %1_%2, dct_dct + movsx r6d, word [cq] + mov [cq], eobd + or r3d, 8 +.dconly: + imul r6d, 181 + add r6d, 128 + sar r6d, 8 +.dconly2: + imul r6d, 181 + add r6d, 128+256 + sar r6d, 8+1 +.dconly3: + imul r6d, 181 + lea r2, [strideq*3] + add r6d, 128+2048 + sar r6d, 8+4 + pxor m2, m2 + vpbroadcastw m3, r6d +.dconly_loop: + mova xm1, [dstq+strideq*0] + vinserti32x4 ym1, [dstq+strideq*1], 1 + vinserti32x4 m1, [dstq+strideq*2], 2 + vinserti32x4 m1, [dstq+r2 ], 3 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + paddw m0, m3 + paddw m1, m3 + packuswb m0, m1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+r2 ], m0, 3 + lea dstq, [dstq+strideq*4] + sub r3d, 4 + jg .dconly_loop + RET +%endif +%endmacro + +%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd + vpbroadcastd m8, [o(pw_2896x8)] + vpermq m0, [cq+32*0], q3120 + add cq, 32*4 + vpermq m7, [cq+32*3], q%1 + vpermq m1, [cq-32*3], q%1 + vpermq m6, [cq+32*2], q3120 + vpermq m2, [cq-32*2], q3120 + vpermq m5, [cq+32*1], q%1 + vpermq m3, [cq-32*1], q%1 + vpermq m4, [cq+32*0], q3120 + REPX {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4 +%endmacro + +INV_TXFM_16X8_FN dct, dct +INV_TXFM_16X8_FN dct, identity +INV_TXFM_16X8_FN dct, adst +INV_TXFM_16X8_FN dct, flipadst + +cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpbroadcastd m1, [o(pw_2896x8)] + vpermq m0, [cq+64*0], q3120 + vpermq m2, [cq+64*1], q3120 + vpermq m4, [cq+64*2], q3120 + vpermq m6, [cq+64*3], q3120 + REPX {pmulhrsw x, m1}, m0, m2, m4, m6 + vextracti32x8 ym1, m0, 1 + vextracti32x8 ym3, m2, 1 + vextracti32x8 ym5, m4, 1 + vextracti32x8 ym7, m6, 1 + call m(idct_8x16_internal_8bpc).main + vbroadcasti32x4 m8, [o(int_shuf1)] + vbroadcasti32x4 m9, [o(int_shuf2)] + vinserti32x8 m0, ym2, 1 ; a0 a1 a2 a3 b0 b1 b2 b3 + vinserti32x8 m1, ym3, 1 ; d0 d1 d2 d3 c0 c1 c2 c3 + vinserti32x8 m4, ym6, 1 ; i0 i1 i2 i3 j0 j1 j2 j3 + vinserti32x8 m5, ym7, 1 ; l0 l1 l2 l3 k0 k1 k2 k3 + vpbroadcastd m2, [o(pw_16384)] + pshufb m0, m8 ; a0 b0 a1 b1 a2 b2 a3 b3 + pshufb m1, m9 ; c0 d0 c1 d1 c2 d2 c3 d3 + pshufb m6, m4, m8 ; i0 j0 i1 j1 i2 j2 i3 j3 + pshufb m7, m5, m9 ; m0 n0 m1 n1 m2 n2 m3 n3 + REPX {pmulhrsw x, m2}, m0, m1, m6, m7 + punpckldq m2, m0, m1 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckhdq m3, m0, m1 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpckldq m4, m6, m7 ; i0 j0 k0 l0 i1 j1 k1 l1 + punpckhdq m5, m6, m7 ; i2 j2 k2 l2 i3 j3 k3 l3 + jmp tx2q +.pass2: + vshufi32x4 m0, m2, m4, q2020 ; 0 1 + vshufi32x4 m2, m4, q3131 ; 4 5 + vshufi32x4 m1, m3, m5, q2020 ; 2 3 + vshufi32x4 m3, m5, q3131 ; 6 7 + call .main + movshdup m4, [o(permC)] + psrlq m6, m4, 4 + vpermq m5, m4, q1032 + vpermi2q m4, m0, m2 ; a2 a3 b2 b3 e2 e3 f2 f3 + vpermt2q m0, m6, m2 ; a0 a1 b0 b1 e0 
e1 f0 f1 + psrlq m6, m5, 4 + vpermi2q m5, m1, m3 ; c2 c3 d2 d3 g2 g3 h2 h3 + vpermt2q m1, m6, m3 ; c0 c1 d0 d1 g0 g1 h0 h1 + vpbroadcastd m6, [o(pw_2048)] +.end: + REPX {pmulhrsw x, m6}, m0, m4, m1, m5 +.end2: + lea r3, [dstq+strideq*4] + lea r4, [strideq*3] + mova xm3, [dstq+strideq*0] + mova xm6, [dstq+strideq*2] + vinserti32x4 ym3, [dstq+strideq*1], 1 + vinserti32x4 ym6, [dstq+r4 ], 1 + vinserti32x4 m3, [r3 +strideq*0], 2 + vinserti32x4 m6, [r3 +strideq*2], 2 + vinserti32x4 m3, [r3 +strideq*1], 3 + vinserti32x4 m6, [r3 +r4 ], 3 + pxor m7, m7 + mova [cq+64*0], m7 + mova [cq+64*1], m7 + mova [cq+64*2], m7 + mova [cq+64*3], m7 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + paddw m0, m2 + paddw m4, m3 + packuswb m0, m4 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [r3 +strideq*0], m0, 2 + vextracti32x4 [r3 +strideq*1], m0, 3 + punpcklbw m3, m6, m7 + punpckhbw m6, m7 + paddw m1, m3 + paddw m5, m6 + packuswb m1, m5 + mova [dstq+strideq*2], xm1 + vextracti32x4 [dstq+r4 ], ym1, 1 + vextracti32x4 [r3 +strideq*2], m1, 2 + vextracti32x4 [r3 +r4 ], m1, 3 + RET +ALIGN function_align +cglobal_label .main + IDCT8_1D_PACKED + ret + +INV_TXFM_16X8_FN adst, dct +INV_TXFM_16X8_FN adst, adst +INV_TXFM_16X8_FN adst, flipadst +INV_TXFM_16X8_FN adst, identity + +cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + call m(iadst_8x16_internal_8bpc).main_pass1 + vpbroadcastd m7, [o(pw_16384_m16384)] + psrlq m10, 4 +.pass1_end: + punpcklwd m5, m4, m2 + punpckhwd m4, m2 + mova m1, m9 + vpdpwssd m1, m5, [o(pw_m2896_2896)] {1to16} + mova m6, m9 + vpdpwssd m6, m5, [o(pw_2896_2896)] {1to16} + mova m2, m9 + vpdpwssd m2, m4, [o(pw_m2896_2896)] {1to16} + vpdpwssd m9, m4, [o(pw_2896_2896)] {1to16} + psrad m1, 12 + psrad m6, 12 + packssdw m1, m6 ; out8 -out7 -out9 out6 + psrad m2, 12 + psrad m9, 12 + packssdw m2, m9 ; -out11 out4 out10 -out5 + psrlq m4, m10, 4 + vpermi2q m4, m0, m2 + vpermt2q m0, m10, m2 + psrlq m5, m10, 8 + vpermi2q m5, m1, m3 + psrlq m10, 12 + vpermt2q m1, m10, m3 + punpcklwd m3, m4, m5 ; a0 c0 a1 c1 a2 c2 a3 c3 + punpckhwd m4, m5 ; b0 d0 b1 d1 b2 d2 b3 d3 + punpcklwd m5, m1, m0 ; i0 k0 i1 k1 2i k2 i3 k3 + punpckhwd m1, m0 ; j0 l0 j1 l1 j2 l2 j3 l3 + punpcklwd m2, m3, m4 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckhwd m3, m4 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpcklwd m4, m5, m1 ; i0 j0 k0 l0 i1 j1 k1 l1 + punpckhwd m5, m1 ; i2 j2 k2 l2 i3 j3 k3 l3 + REPX {pmulhrsw x, m7}, m2, m3, m4, m5 + jmp tx2q +.pass2: + vshufi32x4 m0, m2, m4, q2020 + vshufi32x4 m2, m4, q3131 ; 4 5 + vshufi32x4 m1, m3, m5, q2020 + vshufi32x4 m3, m5, q3131 ; 6 7 + pshufd m4, m0, q1032 ; 1 0 + pshufd m5, m1, q1032 ; 3 2 + call .main_pass2 + movshdup m4, [o(permC)] + pmulhrsw m0, m6 + pmulhrsw m1, m6 + psrlq m6, m4, 4 + mova m5, m4 + vpermi2q m4, m0, m2 + vpermt2q m0, m6, m2 + vpermi2q m5, m1, m3 + vpermt2q m1, m6, m3 + jmp m(idct_16x8_internal_8bpc).end2 +ALIGN function_align +.main_pass1: + vpbroadcastd m4, [o(pw_2896x8)] + pmulhrsw m3, m4, [cq+64*0] + pmulhrsw m1, m4, [cq+64*3] + pmulhrsw m2, m4, [cq+64*1] + pmulhrsw m4, [cq+64*2] + mova m5, [o(int16_perm)] + kxnorb k1, k1, k1 + vpblendmd m0{k1}, m1, m3 ; 0 7 + vmovdqa32 m3{k1}, m1 ; 6 1 + vpblendmd m1{k1}, m4, m2 ; 2 5 + vmovdqa32 m2{k1}, m4 ; 4 3 + REPX {vpermb x, m5, x}, m0, m1, m2, m3 + IADST8_1D_PACKED 1 + ret +ALIGN function_align +cglobal_label .main_pass2 + IADST8_1D_PACKED 2 + pxor m5, m5 + psubd m5, m6 + packssdw m6, m5 + pmulhrsw m2, m6 + pmulhrsw m3, m6 + ret + +INV_TXFM_16X8_FN flipadst, dct +INV_TXFM_16X8_FN flipadst, adst 
+INV_TXFM_16X8_FN flipadst, flipadst +INV_TXFM_16X8_FN flipadst, identity + +cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + call m(iadst_8x16_internal_8bpc).main_pass1 + vpbroadcastd m7, [o(pw_m16384_16384)] + psrlq m10, 20 + jmp m(iadst_16x8_internal_8bpc).pass1_end +.pass2: + vshufi32x4 m0, m2, m4, q2020 + vshufi32x4 m2, m4, q3131 ; 4 5 + vshufi32x4 m1, m3, m5, q2020 + vshufi32x4 m3, m5, q3131 ; 6 7 + pshufd m4, m0, q1032 ; 1 0 + pshufd m5, m1, q1032 ; 3 2 + call m(iadst_16x8_internal_8bpc).main_pass2 + movshdup m4, [o(permC)] + pmulhrsw m5, m6, m0 + pmulhrsw m0, m6, m1 + psrlq m1, m4, 12 + psrlq m4, 8 + mova m7, m4 + vpermi2q m4, m0, m3 + vpermt2q m0, m1, m3 + vpermi2q m1, m5, m2 + vpermt2q m5, m7, m2 + jmp m(idct_16x8_internal_8bpc).end2 + +INV_TXFM_16X8_FN identity, dct +INV_TXFM_16X8_FN identity, adst +INV_TXFM_16X8_FN identity, flipadst +INV_TXFM_16X8_FN identity, identity + +cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpbroadcastd m0, [o(pw_2896x8)] + pmulhrsw m3, m0, [cq+64*0] + pmulhrsw m4, m0, [cq+64*1] + pmulhrsw m5, m0, [cq+64*2] + pmulhrsw m0, [cq+64*3] + vpbroadcastd m7, [o(pw_1697x16)] + vpbroadcastd m8, [o(pw_16384)] + shufps m2, m3, m4, q2020 ; a0 a1 a4 a5 e0 e1 e4 e5 + shufps m3, m4, q3131 ; a2 a3 a6 a7 e2 e3 e6 e7 + shufps m4, m5, m0, q2020 ; i0 i1 i4 i5 m0 m1 m4 m5 + shufps m5, m0, q3131 ; i2 i3 i6 i7 m2 m3 m6 m7 + mova m9, [o(int8_permA)] + pmulhrsw m0, m7, m2 + pmulhrsw m1, m7, m3 + pmulhrsw m6, m7, m4 + pmulhrsw m7, m5 + REPX {pmulhrsw x, m8}, m0, m1, m6, m7 + paddsw m2, m0 + paddsw m3, m1 + paddsw m4, m6 + paddsw m5, m7 + REPX {vpermb x, m9, x}, m2, m3, m4, m5 + jmp tx2q +.pass2: + mova m7, [o(permB)] + vpbroadcastd m6, [o(pw_4096)] + vpermq m0, m7, m2 + vpermq m4, m7, m4 + vpermq m1, m7, m3 + vpermq m5, m7, m5 + jmp m(idct_16x8_internal_8bpc).end + +%macro INV_TXFM_16X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x16 +%ifidn %1_%2, dct_dct + movsx r6d, word [cq] + mov [cq], eobd + or r3d, 16 + imul r6d, 181 + add r6d, 128+512 + sar r6d, 8+2 + jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3 +%endif +%endmacro + +INV_TXFM_16X16_FN dct, dct +INV_TXFM_16X16_FN dct, identity +INV_TXFM_16X16_FN dct, adst +INV_TXFM_16X16_FN dct, flipadst + +cglobal idct_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m7, [o(permB)] + vpermq m0, m7, [cq+64*0] + vpermq m1, m7, [cq+64*1] + vpermq m2, m7, [cq+64*2] + vpermq m3, m7, [cq+64*3] + vpermq m4, m7, [cq+64*4] + vpermq m5, m7, [cq+64*5] + vpermq m6, m7, [cq+64*6] + vpermq m7, m7, [cq+64*7] + call .main + vbroadcasti32x4 m12, [o(int_shuf1)] + vbroadcasti32x4 m11, [o(int_shuf2)] + vpbroadcastd m13, [o(pw_8192)] + pshufb m0, m12 + pshufb m8, m1, m11 + pshufb m2, m12 + pshufb m9, m3, m11 + pshufb m4, m12 + pshufb m10, m5, m11 + pshufb m6, m12 + pshufb m11, m7, m11 + REPX {pmulhrsw x, m13}, m0, m8, m2, m9, m4, m10, m6, m11 + punpckhdq m1, m0, m8 + punpckldq m0, m8 + punpckhdq m3, m2, m9 + punpckldq m2, m9 + punpckhdq m5, m4, m10 + punpckldq m4, m10 + punpckhdq m7, m6, m11 + punpckldq m6, m11 + jmp tx2q +.pass2: + vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc + vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4 + vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec + vinserti32x8 m0, ym2, 1 ; a0 a4 e0 e4 + vshufi32x4 m9, m5, m7, q3232 ; ia ie ma me + vinserti32x8 m5, ym7, 1 ; i2 i6 m2 m6 + vshufi32x4 m7, m1, m3, q3232 ; aa ae ea ee + vinserti32x8 m1, ym3, 1 ; a2 a6 e2 e6 + vshufi32x4 m2, m0, m4, q3131 ; 4 5 + vshufi32x4 m0, m4, q2020 ; 0 1 + vshufi32x4 m4, m6, m8, q2020 ; 8 9 + vshufi32x4 m6, m8, 
q3131 ; 12 13 + vshufi32x4 m3, m1, m5, q3131 ; 6 7 + vshufi32x4 m1, m5, q2020 ; 2 3 + vshufi32x4 m5, m7, m9, q2020 ; 10 11 + vshufi32x4 m7, m9, q3131 ; 14 15 + call .main + mova m8, [o(permD)] + psrlq m12, m8, 4 + psrlq m9, m8, 8 + psrlq m13, m8, 12 + mova m10, m8 + vpermi2q m8, m0, m2 ; 0 1 4 5 + vpermt2q m0, m12, m2 + mova m11, m9 + vpermi2q m9, m1, m3 ; 2 3 6 7 + vpermt2q m1, m13, m3 + vpermi2q m10, m4, m6 ; 8 9 12 13 + vpermt2q m4, m12, m6 + vpermi2q m11, m5, m7 ; 10 11 14 15 + vpermt2q m5, m13, m7 +.end: + vpbroadcastd m12, [o(pw_2048)] +.end2: + REPX {pmulhrsw x, m12}, m0, m1, m4, m5 +.end3: + REPX {pmulhrsw x, m12}, m8, m9, m10, m11 + lea r3, [strideq*3] + lea r4, [dstq+strideq*4] + lea r5, [dstq+strideq*8] + lea r6, [r4 +strideq*8] + mova xm3, [dstq+strideq*0] + mova xm6, [dstq+strideq*2] + vinserti32x4 ym3, [dstq+strideq*1], 1 + vinserti32x4 ym6, [dstq+r3 ], 1 + vinserti32x4 m3, [r4+strideq*0], 2 + vinserti32x4 m6, [r4+strideq*2], 2 + vinserti32x4 m3, [r4+strideq*1], 3 + vinserti32x4 m6, [r4+r3 ], 3 + mova xm12, [r5+strideq*0] + mova xm13, [r5+strideq*2] + vinserti32x4 ym12, [r5+strideq*1], 1 + vinserti32x4 ym13, [r5+r3 ], 1 + vinserti32x4 m12, [r6+strideq*0], 2 + vinserti32x4 m13, [r6+strideq*2], 2 + vinserti32x4 m12, [r6+strideq*1], 3 + vinserti32x4 m13, [r6+r3 ], 3 + pxor m7, m7 + REPX {mova [cq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + paddw m0, m2 + paddw m8, m3 + packuswb m0, m8 + punpcklbw m2, m6, m7 + punpckhbw m6, m7 + paddw m1, m2 + paddw m9, m6 + packuswb m1, m9 + punpcklbw m2, m12, m7 + punpckhbw m12, m7 + paddw m2, m4 + paddw m10, m12 + packuswb m2, m10 + punpcklbw m3, m13, m7 + punpckhbw m13, m7 + paddw m3, m5 + paddw m11, m13 + packuswb m3, m11 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + mova [dstq+strideq*2], xm1 + vextracti32x4 [dstq+r3 ], ym1, 1 + vextracti32x4 [r4+strideq*0], m0, 2 + vextracti32x4 [r4+strideq*1], m0, 3 + vextracti32x4 [r4+strideq*2], m1, 2 + vextracti32x4 [r4+r3 ], m1, 3 + mova [r5+strideq*0], xm2 + vextracti32x4 [r5+strideq*1], ym2, 1 + mova [r5+strideq*2], xm3 + vextracti32x4 [r5+r3 ], ym3, 1 + vextracti32x4 [r6+strideq*0], m2, 2 + vextracti32x4 [r6+strideq*1], m2, 3 + vextracti32x4 [r6+strideq*2], m3, 2 + vextracti32x4 [r6+r3 ], m3, 3 + RET +ALIGN function_align +cglobal_label .main_fast2 ; bottom three-quarters are zero + vpbroadcastd m10, [o(pd_2048)] + vpbroadcastq m13, [o(int_mshift)] + vpcmpub k7, m13, m10, 6 +.main_fast4: + vpbroadcastd m2, [o(pw_401_4076x8)] + vpbroadcastd m4, [o(pw_m1189_3920x8)] + vpbroadcastd m3, [o(pw_799_4017x8)] + pmulhrsw m2, m8 ; t8a t15a + pmulhrsw m4, m1 ; t11a t12a + pmulhrsw m7, m3 ; t4a t7a + pxor m6, m6 + psubsw m0, m2, m4 ; t11a t12a + paddsw m8, m2, m4 ; t8a t15a + mova m1, m7 + jmp .main5 +ALIGN function_align +cglobal_label .main_fast ; bottom half is zero + vpbroadcastd m10, [o(pd_2048)] +.main_fast3: + vpbroadcastq m13, [o(int_mshift)] + vpcmpub k7, m13, m10, 6 +.main_fast5: + vpbroadcastd m2, [o(pw_401_4076x8)] + vpbroadcastd m4, [o(pw_m2598_3166x8)] + vpbroadcastd m11, [o(pw_1931_3612x8)] + vpbroadcastd m12, [o(pw_m1189_3920x8)] + pmulhrsw m8, m2 ; t8a t15a + vpbroadcastd m2, [o(pw_799_4017x8)] + pmulhrsw m0, m4 ; t9a t14a + vpbroadcastd m4, [o(pw_m2276_3406x8)] + pmulhrsw m5, m11 ; t10a t13a + pmulhrsw m1, m12 ; t11a t12a + pmulhrsw m7, m2 ; t4a t7a + pmulhrsw m3, m4 ; t5a t6a + jmp .main4 +ALIGN function_align +cglobal_label .main + IDCT16_1D_PACKED + ret + +INV_TXFM_16X16_FN adst, dct +INV_TXFM_16X16_FN adst, adst 
+INV_TXFM_16X16_FN adst, flipadst + +cglobal iadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + call .main_pass1 + vpbroadcastd m10, [o(pw_8192_m8192)] + punpcklwd m8, m0, m1 ; b0 d0 b1 d1 b2 d2 b3 d3 + punpckhwd m0, m1 ; a0 c0 a1 c1 a2 c2 a3 c3 + punpckhwd m1, m0, m8 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpcklwd m0, m8 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpcklwd m8, m2, m3 ; f0 h0 f1 h1 f2 h2 f3 h3 + punpckhwd m2, m3 ; e0 g0 e1 g1 e2 g2 e3 g3 + punpckhwd m3, m2, m8 ; e2 f2 g2 h2 e3 f3 g3 h3 + punpcklwd m2, m8 ; e0 f0 g0 h0 e1 f1 g1 h1 + punpckhwd m8, m4, m5 ; i0 k0 i1 k1 i2 k2 i3 k3 + punpcklwd m4, m5 ; j0 l0 j1 l1 j2 l2 j3 l3 + punpckhwd m5, m4, m8 ; i2 j2 k2 l2 i3 j3 k3 l3 + punpcklwd m4, m8 ; i0 j0 k0 l0 i1 j1 k1 l1 + punpckhwd m8, m6, m7 ; m0 o0 m1 o1 m2 o2 m3 o3 + punpcklwd m6, m7 ; n0 p0 n1 p1 n2 p2 n3 p3 + punpckhwd m7, m6, m8 ; m2 n2 o2 p2 m3 n3 o3 p3 + punpcklwd m6, m8 ; m0 n0 o0 p0 m1 n1 o1 p1 +.pass1_end: + REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 + jmp tx2q +.pass2: + call .main_pass2 + mova m10, [o(permD)] + psrlq m8, m10, 8 + psrlq m12, m10, 12 + psrlq m13, m10, 4 + mova m9, m8 + vpermi2q m8, m0, m2 ; 0 1 4 5 + vpermt2q m0, m12, m2 + vpermi2q m9, m1, m3 ; 2 3 6 7 + vpermt2q m1, m12, m3 + vpbroadcastd m12, [o(pw_2048)] + mov r3d, 0xff00ff00 + mova m11, m10 + vpermi2q m10, m4, m6 ; 8 9 12 13 + vpermt2q m4, m13, m6 + kmovd k1, r3d + vpermi2q m11, m5, m7 ; 10 11 14 15 + vpermt2q m5, m13, m7 + pxor m7, m7 + vpsubw m12{k1}, m7, m12 + jmp m(idct_16x16_internal_8bpc).end2 +ALIGN function_align +.main_pass1: + mova m4, [o(permB)] + psrlq m3, m4, 4 + vpermq m0, m4, [cq+64*0] + vpermq m7, m3, [cq+64*7] + vpermq m6, m4, [cq+64*6] + vpermq m1, m3, [cq+64*1] + vpermq m2, m4, [cq+64*2] + vpermq m5, m3, [cq+64*5] + vpermq m4, m4, [cq+64*4] + vpermq m3, m3, [cq+64*3] + call .main + vpbroadcastd m13, [o(pw_2896_2896)] + vpbroadcastd m12, [o(pw_m2896_2896)] + mova m2, m10 + vpdpwssd m2, m5, m13 ; -out5 + mova m8, m10 + vpdpwssd m8, m11, m13 ; out4 + mova m9, m10 + vpdpwssd m9, m5, m12 ; out10 + mova m5, m10 + vpdpwssd m5, m11, m12 ; -out11 + mova m11, m10 + vpdpwssd m11, m3, m13 ; -out7 + mova m14, m10 + vpdpwssd m14, m4, m13 ; out6 + mova m13, m10 + vpdpwssd m13, m3, m12 ; out8 + vpdpwssd m10, m4, [o(pw_2896_m2896)] {1to16} ; -out9 + REPX {psrad x, 12}, m2, m8, m9, m5, m11, m14, m13, m10 + packssdw m2, m8 ; -out5 out4 + packssdw m5, m9, m5 ; out10 -out11 + packssdw m3, m11, m14 ; -out7 out6 + packssdw m4, m13, m10 ; out8 -out9 + ret +ALIGN function_align +.main_pass2: + vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc + vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4 + vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec + vinserti32x8 m0, ym2, 1 ; a0 a4 e0 e4 + vshufi32x4 m9, m5, m7, q3232 ; ia ie ma me + vinserti32x8 m5, ym7, 1 ; i2 i6 m2 m6 + vshufi32x4 m7, m1, m3, q3232 ; aa ae ea ee + vinserti32x8 m1, ym3, 1 ; a2 a6 e2 e6 + vshufi32x4 m2, m0, m4, q3131 ; 4 5 + vshufi32x4 m0, m4, q2020 ; 0 1 + vshufi32x4 m4, m6, m8, q2020 ; 8 9 + vshufi32x4 m6, m8, q3131 ; 12 13 + vshufi32x4 m3, m1, m5, q3131 ; 6 7 + vshufi32x4 m1, m5, q2020 ; 2 3 + vshufi32x4 m5, m7, m9, q2020 ; 10 11 + vshufi32x4 m7, m9, q3131 ; 14 15 +cglobal_label .main_pass2b + REPX {pshufd x, x, q1032}, m1, m3, m5, m7 + call .main + vpbroadcastd m8, [o(pw_2896x8)] + pshufb m2, m11, m12 + pshufb m5, m12 + pshufb m3, m12 + pshufb m4, m12 + punpcklqdq m9, m5, m2 ; t15a t7 + punpckhqdq m5, m2 ; t14a t6 + shufps m2, m3, m4, q1032 ; t2a t10 + shufps m3, m4, q3210 ; t3a t11 + psubsw m4, m2, m3 ; out8 -out9 + paddsw m3, m2 ; -out7 out6 + paddsw m2, m5, 
m9 ; -out5 out4 + psubsw m5, m9 ; out10 -out11 + REPX {pmulhrsw x, m8}, m2, m3, m4, m5 + ret +ALIGN function_align +.main: + vpbroadcastd m10, [o(pd_2048)] + vpbroadcastq m13, [o(int_mshift)] + punpckhwd m8, m7, m0 ; in14 in1 + punpcklwd m0, m7 ; in0 in15 + punpcklwd m7, m6, m1 ; in12 in3 + punpckhwd m1, m6 ; in2 in13 + punpckhwd m6, m5, m2 ; in10 in5 + punpcklwd m2, m5 ; in4 in11 + punpcklwd m5, m4, m3 ; in8 in7 + punpckhwd m3, m4 ; in6 in9 + vpcmpub k7, m13, m10, 6 ; 0x33... + ITX_MUL2X_PACK 0, 4, 9, 10, 201, 4091, 5 ; t0 t1 + ITX_MUL2X_PACK 1, 4, 9, 10, 995, 3973, 5 ; t2 t3 + ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 5 ; t4 t5 + ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 5 ; t6 t7 + ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 5 ; t8 t9 + ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 5 ; t10 t11 + ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 5 ; t12 t13 + ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 5 ; t14 t15 + psubsw m4, m0, m5 ; t9a t8a + paddsw m0, m5 ; t1a t0a + psubsw m5, m1, m6 ; t11a t10a + paddsw m1, m6 ; t3a t2a + psubsw m6, m2, m7 ; t13a t12a + paddsw m2, m7 ; t5a t4a + psubsw m7, m3, m8 ; t15a t14a + paddsw m3, m8 ; t7a t6a + ITX_MUL2X_PACK 4, 8, 9, 10, 799, 4017, 4 ; t8 t9 + ITX_MUL2X_PACK 6, 8, 9, 10, 799_4017, 4017_m799, 52 ; t12 t13 + ITX_MUL2X_PACK 5, 8, 9, 10, 3406, 2276, 4 ; t10 t11 + ITX_MUL2X_PACK 7, 8, 9, 10, 3406_2276, 2276_m3406, 52 ; t14 t15 + psubsw m8, m1, m3 ; t7 t6 + paddsw m1, m3 ; t3 t2 + psubsw m3, m0, m2 ; t5 t4 + paddsw m0, m2 ; t1 t0 + psubsw m2, m5, m7 ; t14a t15a + paddsw m7, m5 ; t10a t11a + psubsw m5, m4, m6 ; t12a t13a + paddsw m4, m6 ; t8a t9a + ITX_MUL2X_PACK 3, 6, 9, 10, 1567, 3784, 5 ; t5a t4a + ITX_MUL2X_PACK 8, 6, 9, 10, 3784_m1567, 1567_3784, 52 ; t7a t6a + ITX_MUL2X_PACK 2, 6, 9, 10, 3784, 1567, 4 ; t15 t14 + ITX_MUL2X_PACK 5, 6, 9, 10, 3784_1567, 1567_m3784, 52 ; t13 t12 + vbroadcasti32x4 m12, [o(deint_shuf)] + paddsw m6, m4, m7 ; -out1 out14 + psubsw m4, m7 ; t10 t11 + psubsw m11, m3, m8 ; t7 t6 + paddsw m8, m3 ; out12 -out3 + psubsw m3, m0, m1 ; t3a t2a + paddsw m0, m1 ; -out15 out0 + paddsw m1, m2, m5 ; -out13 out2 + psubsw m5, m2 ; t15a t14a + pshufb m0, m12 + pshufb m6, m12 + pshufb m8, m12 + pshufb m1, m12 + shufps m7, m6, m0, q1032 ; out14 -out15 + shufps m0, m6, m0, q3210 ; -out1 out0 + punpcklqdq m6, m8, m1 ; out12 -out13 + punpckhqdq m1, m8, m1 ; -out3 out2 + ret + +INV_TXFM_16X16_FN flipadst, dct +INV_TXFM_16X16_FN flipadst, adst +INV_TXFM_16X16_FN flipadst, flipadst + +cglobal iflipadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + call m(iadst_16x16_internal_8bpc).main_pass1 + vpbroadcastd m10, [o(pw_m8192_8192)] + punpcklwd m8, m1, m0 ; m0 o0 m1 o1 m2 o2 m3 o3 + punpckhwd m9, m1, m0 ; n0 p0 n1 p1 n2 p2 n3 p3 + punpckhwd m1, m7, m6 ; a0 c0 a1 c1 a2 c2 a3 c3 + punpcklwd m7, m6 ; b0 d0 b1 d1 b2 d2 b3 d3 + punpcklwd m0, m1, m7 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckhwd m1, m7 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpcklwd m6, m8, m9 ; m0 n0 o0 p0 m1 n1 o1 p1 + punpckhwd m7, m8, m9 ; m2 n2 o2 p2 m3 n3 o3 p3 + punpcklwd m8, m3, m2 ; i0 k0 i1 k1 i2 k2 i3 k3 + punpckhwd m9, m3, m2 ; j0 l0 j1 l1 j2 l2 j3 l3 + punpckhwd m3, m5, m4 ; e0 g0 e1 g1 e2 g2 e3 g3 + punpcklwd m5, m4 ; f0 h0 f1 h1 f2 h2 f3 h3 + punpcklwd m2, m3, m5 ; e0 f0 g0 h0 e1 f1 g1 h1 + punpckhwd m3, m5 ; e2 f2 g2 h2 e3 f3 g3 h3 + punpcklwd m4, m8, m9 ; i0 j0 k0 l0 i1 j1 k1 l1 + punpckhwd m5, m8, m9 ; i2 j2 k2 l2 i3 j3 k3 l3 + jmp m(iadst_16x16_internal_8bpc).pass1_end +.pass2: + call m(iadst_16x16_internal_8bpc).main_pass2 + mova m10, [o(permD)] + psrlq m8, m10, 8 + psrlq m12, m10, 12 + psrlq m13, 
m10, 4 + mova m9, m8 + vpermi2q m8, m7, m5 ; 0 1 4 5 + vpermt2q m7, m12, m5 + vpermi2q m9, m6, m4 ; 2 3 6 7 + vpermt2q m6, m12, m4 + vpbroadcastd m12, [o(pw_2048)] + mov r3d, 0x00ff00ff + mova m11, m10 + vpermi2q m10, m3, m1 ; 8 9 12 13 + vpermt2q m3, m13, m1 + kmovd k1, r3d + vpermi2q m11, m2, m0 ; 10 11 14 15 + vpermt2q m2, m13, m0 + pxor m0, m0 + vpsubw m12{k1}, m0, m12 + pmulhrsw m0, m7, m12 + pmulhrsw m1, m6, m12 + pmulhrsw m4, m3, m12 + pmulhrsw m5, m2, m12 + jmp m(idct_16x16_internal_8bpc).end3 + +INV_TXFM_16X16_FN identity, dct +INV_TXFM_16X16_FN identity, identity + +cglobal iidentity_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m8, [o(int16_perm)] + vpermb m1, m8, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3 + vpermb m2, m8, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3 + vpbroadcastd m0, [o(pw_1697x16)] + vpermb m3, m8, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3 + vpermb m4, m8, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3 + vpermb m5, m8, [cq+64*4] ; i0 j0 i1 j1 i2 j2 i3 j3 + vpermb m6, m8, [cq+64*5] ; k0 l0 k1 l1 k2 l2 k3 l3 + vpermb m7, m8, [cq+64*6] ; m0 n0 m1 n1 m2 n2 m3 n3 + vpermb m8, m8, [cq+64*7] ; o0 p0 o1 p1 o2 p2 o3 p3 + pmulhrsw m9, m0, m1 + pmulhrsw m10, m0, m2 + pmulhrsw m11, m0, m3 + pmulhrsw m12, m0, m4 + pmulhrsw m13, m0, m5 + pmulhrsw m14, m0, m6 + pmulhrsw m15, m0, m7 + pmulhrsw m0, m8 + REPX {psraw x, 1}, m9, m10, m11, m12 + pavgw m1, m9 + pavgw m2, m10 + pavgw m3, m11 + pavgw m4, m12 + REPX {psraw x, 1}, m13, m14, m15, m0 + pavgw m5, m13 + pavgw m6, m14 + pavgw m7, m15 + pavgw m8, m0 + punpckldq m0, m1, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckhdq m1, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpckldq m2, m3, m4 ; e0 f0 g0 h0 e1 f1 g1 h1 + punpckhdq m3, m4 ; e2 f2 g2 h2 e3 f3 g3 h3 + punpckldq m4, m5, m6 ; i0 j0 k0 l0 i1 j1 k1 l1 + punpckhdq m5, m6 ; i2 j2 k2 l2 i3 j3 k3 l3 + punpckldq m6, m7, m8 ; m0 n0 o0 p0 m1 n1 o1 p1 + punpckhdq m7, m8 ; m2 n2 o2 p2 m3 n3 o3 p3 + jmp tx2q +ALIGN function_align +.pass2: + vpbroadcastd m11, [o(pw_1697x16)] + pmulhrsw m12, m11, m0 + pmulhrsw m13, m11, m1 + pmulhrsw m14, m11, m2 + pmulhrsw m15, m11, m3 + pmulhrsw m8, m11, m4 + pmulhrsw m9, m11, m5 + pmulhrsw m10, m11, m6 + pmulhrsw m11, m7 + REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7 + paddsw m0, m12 + paddsw m1, m13 + paddsw m2, m14 + paddsw m3, m15 + paddsw m8, m4 + movu m4, [o(permD+2)] + paddsw m9, m5 + paddsw m6, m10 + paddsw m7, m11 + psrlq m12, m4, 4 + mova m5, m4 + mova m10, m4 + mova m11, m4 + vpermi2q m4, m0, m2 ; 8 9 12 13 + vpermt2q m0, m12, m2 ; 0 1 4 5 + vpermi2q m5, m1, m3 ; 10 11 14 15 + vpermt2q m1, m12, m3 ; 2 3 6 7 + vpermi2q m10, m8, m6 + vpermt2q m8, m12, m6 + vpermi2q m11, m9, m7 + vpermt2q m9, m12, m7 + jmp m(idct_16x16_internal_8bpc).end + +%macro ITX_UNPACK_MULHRSW 8 ; dst[1-2], src, tmp, coef[1-4] + vpbroadcastd m%4, [o(pw_%5_%6x8)] + punpcklwd m%1, m%3, m%3 + pmulhrsw m%1, m%4 + vpbroadcastd m%4, [o(pw_%7_%8x8)] + punpckhwd m%2, m%3, m%3 + pmulhrsw m%2, m%4 +%endmacro + +cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + cmp eobd, 107 + jb .fast + mova m5, [cq+64*5] + mova m3, [cq+64*3] + mova m1, [cq+64*1] + mova m7, [cq+64*7] + mova m2, [cq+64*2] + mova m6, [cq+64*6] + mova m0, [cq+64*0] + mova m4, [cq+64*4] + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + mova m8, [o(idct_8x32p)] + vpbroadcastd m9, [o(pw_8192)] + REPX {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7 + punpckldq m8, m0, m1 ; ab + punpckhdq m0, m1 + punpckldq m1, m2, m3 ; cd + punpckhdq m2, m3 + punpckldq m3, m4, m5 ; ef 
+ punpckhdq m4, m5 + punpckldq m5, m6, m7 ; gh + punpckhdq m6, m7 + REPX {pmulhrsw x, m9}, m8, m0, m1, m2, m3, m4, m5, m6 + punpcklqdq m18, m8, m1 ; 30 2 6 26 31 1 23 9 + punpckhqdq m14, m8, m1 ; 16 0 12 20 3 29 11 21 + punpcklqdq m21, m0, m2 ; 14 18 22 10 27 5 19 13 + punpckhqdq m15, m0, m2 ; 18 4 24 8 7 25 15 17 + punpcklqdq m20, m3, m5 + punpckhqdq m16, m3, m5 + punpcklqdq m19, m4, m6 + punpckhqdq m17, m4, m6 + vinserti32x4 ym8, ym18, xm20, 1 + vshufi32x4 ym1, ym18, ym20, 0x03 + vinserti32x4 ym9, ym14, xm16, 1 + vshufi32x4 ym3, ym14, ym16, 0x03 + vinserti32x4 ym0, ym21, xm19, 1 + vshufi32x4 ym5, ym21, ym19, 0x03 + vinserti32x4 ym7, ym15, xm17, 1 + vshufi32x4 ym6, ym15, ym17, 0x03 + call m(idct_8x16_internal_8bpc).main2 + psrlq m12, [o(permB)], 60 + vpermt2q m14, m12, m16 + vpermt2q m21, m12, m19 + vpermt2q m15, m12, m17 + vpermi2q m12, m18, m20 + vextracti32x8 ym16, m14, 1 + vextracti32x8 ym19, m21, 1 + vextracti32x8 ym17, m15, 1 + vextracti32x8 ym20, m12, 1 + call .main2 + jmp .end +.fast: ; right half is zero + mova m0, [o(int16_perm)] + mova ym2, [cq+64*4] + vinserti32x8 m2, [cq+64*0], 1 + mova ym3, [cq+64*6] + vinserti32x8 m3, [cq+64*2], 1 + mova ym4, [cq+64*3] + vinserti32x8 m4, [cq+64*5], 1 + mova ym5, [cq+64*7] + vinserti32x8 m5, [cq+64*1], 1 + REPX {vpermb x, m0, x}, m2, m3, m4, m5 + call m(idct_16x8_internal_8bpc).main2 + vbroadcasti32x4 m4, [o(int_shuf3)] + vbroadcasti32x4 m5, [o(int_shuf4)] + pshufb m2, m4 ; e0 f0 e2 f2 e1 f1 e3 f3 + pshufb m3, m5 ; g0 h0 g2 h2 g1 h1 g3 h3 + pshufb m0, m4 ; a0 b0 a2 b2 a1 b1 a3 b3 + pshufb m1, m5 ; c0 d0 c2 d2 c1 d1 c3 d3 + vpbroadcastd m4, [o(pw_8192)] + psrlq m5, [o(permB)], 60 + punpckldq m6, m2, m3 ; e0 f0 g0 h0 e2 f2 g2 h2 + punpckhdq m17, m2, m3 ; e1 f1 g1 h1 e3 f3 g3 h3 + punpckldq m2, m0, m1 ; a0 b0 c0 d0 a2 b2 c2 d2 + punpckhdq m16, m0, m1 ; a1 b1 c1 d1 a3 b3 c3 d3 + REPX {pmulhrsw x, m4}, m6, m17, m2, m16 + vinserti32x4 ym0, ym2, xm6, 1 ; 0 2 + vshufi32x4 ym1, ym2, ym6, 0x03 ; 4 6 + vinserti32x4 ym14, ym16, xm17, 1 ; 1 3 + vshufi32x4 ym15, ym16, ym17, 0x03 ; 5 7 + vpermt2q m2, m5, m6 ; 8 10 + vpermt2q m16, m5, m17 ; 9 11 + vextracti32x8 ym3, m2, 1 ; 12 14 + vextracti32x8 ym17, m16, 1 ; 13 15 + call m(idct_8x16_internal_8bpc).main_fast + call .main_fast +.end: + vpbroadcastd ym8, strided + pmulld ym8, [o(gather8d)] + call .main_end + lea r3, [dstq+strideq*4] + kxnorb k1, k1, k1 + lea r4, [dstq+strideq*8] + pxor m9, m9 + lea r1, [r3+strideq*8] + kmovb k2, k1 + vpgatherdq m12{k1}, [r0+ym8] + kmovb k1, k2 + vpgatherdq m13{k2}, [r3+ym8] + kmovb k2, k1 + vpgatherdq m14{k1}, [r4+ym8] + kmovb k1, k2 + vpgatherdq m15{k2}, [r1+ym8] + REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {mova [cq+64*x], m9}, 0, 1, 2, 3, 4, 5, 6, 7 + punpcklbw m11, m12, m9 + punpckhbw m12, m9 + paddw m0, m11 + paddw m1, m12 + packuswb m0, m1 + kmovb k2, k1 + vpscatterdq [r0+ym8]{k1}, m0 + punpcklbw m12, m13, m9 + punpckhbw m13, m9 + paddw m2, m12 + paddw m3, m13 + packuswb m2, m3 + kmovb k1, k2 + vpscatterdq [r3+ym8]{k2}, m2 + punpcklbw m13, m14, m9 + punpckhbw m14, m9 + paddw m4, m13 + paddw m5, m14 + packuswb m4, m5 + kmovb k2, k1 + vpscatterdq [r4+ym8]{k1}, m4 + punpcklbw m14, m15, m9 + punpckhbw m15, m9 + paddw m6, m14 + paddw m7, m15 + packuswb m6, m7 + vpscatterdq [r1+ym8]{k2}, m6 + RET +.dconly: + movsx r6d, word [cq] + mov [cq], eobd + or r3d, 32 + imul r6d, 181 + add r6d, 128+512 + sar r6d, 8+2 + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2 +INIT_YMM avx512icl +ALIGN function_align +cglobal_label .main_fast2 ; bottom three-quarters are zero + 
ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a + ITX_UNPACK_MULHRSW 21, 20, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a + mova m11, m12 + mova m17, m20 + mova m15, m21 + mova m16, m14 + jmp .main4 +ALIGN function_align +cglobal_label .main_fast ; bottom half is zero + ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a + ITX_UNPACK_MULHRSW 21, 15, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a + ITX_UNPACK_MULHRSW 20, 16, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a + ITX_UNPACK_MULHRSW 19, 17, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a + jmp .main3 +ALIGN function_align +cglobal_label .main + punpcklwd m12, m21, m14 ; in31 in1 + punpckhwd m14, m21 ; in3 in29 + punpcklwd m21, m20, m15 ; in27 in5 + punpckhwd m15, m20 ; in7 in25 + punpcklwd m20, m19, m16 ; in23 in9 + punpckhwd m16, m19 ; in11 in21 + punpcklwd m19, m18, m17 ; in19 in13 + punpckhwd m17, m18 ; in15 in17 +.main2: + ITX_MUL2X_PACK 12, 8, 9, 10, 201, 4091, 5 ; t16a, t31a + ITX_MUL2X_PACK 14, 8, 9, 10, 4052, 601, 5 ; t23a, t24a + ITX_MUL2X_PACK 21, 8, 9, 10, 995, 3973, 5 ; t20a, t27a + ITX_MUL2X_PACK 15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a + ITX_MUL2X_PACK 20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a + ITX_MUL2X_PACK 16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a + ITX_MUL2X_PACK 19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a + ITX_MUL2X_PACK 17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a +.main3: + psubsw m11, m12, m17 ; t17 t30 + paddsw m12, m17 ; t16 t31 + psubsw m17, m15, m20 ; t18 t29 + paddsw m20, m15 ; t19 t28 + psubsw m15, m21, m16 ; t21 t26 + paddsw m21, m16 ; t20 t27 + psubsw m16, m14, m19 ; t22 t25 + paddsw m14, m19 ; t23 t24 +.main4: + ITX_MUL2X_PACK 11, 18, 19, 10, 799, 4017, 5 ; t17a t30a + ITX_MUL2X_PACK 17, 18, 19, 10, m4017, 799, 5 ; t18a t29a + ITX_MUL2X_PACK 15, 18, 19, 10, 3406, 2276, 5 ; t21a t26a + ITX_MUL2X_PACK 16, 18, 19, 10, m2276, 3406, 5 ; t22a t25a + vpbroadcastd m8, [o(pw_m3784_1567)] + psubsw m19, m12, m20 ; t19a t28a + paddsw m20, m12 ; t16a t31a + psubsw m12, m14, m21 ; t20a t27a + paddsw m14, m21 ; t23a t24a + psubsw m21, m11, m17 ; t18 t29 + paddsw m11, m17 ; t17 t30 + psubsw m17, m16, m15 ; t21 t26 + paddsw m16, m15 ; t22 t25 + ITX_MUL2X_PACK 21, 18, 15, 10, 1567_3784, 8, 20 ; t18a t29a + ITX_MUL2X_PACK 19, 18, 15, 10, 1567_3784, 8, 20 ; t19 t28 + ITX_MUL2X_PACK 12, 18, 15, 10, 8, m1567_m3784, 36 ; t20 t27 + ITX_MUL2X_PACK 17, 18, 15, 10, 8, m1567_m3784, 36 ; t21a t26a + vbroadcasti32x4 m18, [o(deint_shuf)] + vpbroadcastd m8, [o(pw_m2896_2896)] + vpbroadcastd m9, [o(pw_2896_2896)] + psubsw m15, m20, m14 ; t23 t24 + paddsw m20, m14 ; t16 t31 + psubsw m14, m11, m16 ; t22a t25a + paddsw m11, m16 ; t17a t30a + psubsw m16, m21, m17 ; t21 t26 + paddsw m21, m17 ; t18 t29 + psubsw m17, m19, m12 ; t20a t27a + paddsw m19, m12 ; t19a t28a + REPX {pshufb x, m18}, m20, m11, m21, m19 + ITX_MUL2X_PACK 15, 18, 12, 10, 8, 9, 8 ; t23a t22a + ITX_MUL2X_PACK 14, 13, 15, 10, 8, 9, 8 ; t22 t25 + packssdw m18, m13 ; t23a t22 + packssdw m12, m15 ; t24a t25 + ITX_MUL2X_PACK 16, 13, 15, 10, 8, 9, 8 ; t21a t26a + ITX_MUL2X_PACK 17, 16, 14, 10, 8, 9, 8 ; t20 t27 + packssdw m16, m13 ; t20 t21a + packssdw m14, m15 ; t27 t26a + punpcklqdq m13, m19, m21 ; t19a t18 + punpckhqdq m19, m21 ; t28a t29 + punpcklqdq m21, m20, m11 ; t16 t17a + punpckhqdq m20, m11 ; t31 t30a +INIT_ZMM avx512icl + mova m15, [o(permA)] + ret +cglobal_label .main_end + vpbroadcastd m10, [o(pw_2048)] + vpermt2q m0, m15, m1 ; t0 t1 t2 t3 + vpermt2q m20, m15, m19 ; t31 t30a 
t29 t28a + vpermt2q m2, m15, m3 ; t4 t5 t6 t7 + vpermt2q m14, m15, m12 ; t27 t26a t25 t24a + vpermt2q m4, m15, m5 ; t8 t9 t10 t11 + vpermt2q m18, m15, m16 ; t23a t22 t21a t20 + vpermt2q m6, m15, m7 ; t12 t13 t14 t15 + vpermt2q m13, m15, m21 ; t19a t18 t17a t16 + psubsw m7, m0, m20 ; out31 out30 out29 out28 + paddsw m0, m20 ; out0 out1 out2 out3 + psubsw m5, m2, m14 ; out27 out26 out25 out24 + paddsw m2, m14 ; out4 out5 out6 out7 + psubsw m3, m4, m18 ; out23 out22 out21 out20 + paddsw m4, m18 ; out8 out9 out10 out11 + psubsw m1, m6, m13 ; out19 out18 out17 out16 + paddsw m6, m13 ; out12 out13 out14 out15 + vzeroupper + ret + +%macro LOAD_PACKED_16X2 3 ; dst, row[1-2] + vbroadcasti32x4 ym%1, [cq+16*%2] + vbroadcasti32x4 ym8, [cq+16*%3] + shufpd ym%1, ym8, 0x0c +%endmacro + +cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob +%undef cmp + test eobd, eobd + jz .dconly + lea r5, [o_base] + LOAD_PACKED_16X2 0, 0, 2 ; in0 in2 + LOAD_PACKED_16X2 1, 4, 6 ; in4 in6 + LOAD_PACKED_16X2 2, 8, 10 ; in8 in10 + LOAD_PACKED_16X2 3, 12, 14 ; in12 in14 + LOAD_PACKED_16X2 14, 1, 3 ; in1 in3 + LOAD_PACKED_16X2 15, 5, 7 ; in5 in7 + LOAD_PACKED_16X2 16, 9, 11 ; in9 in11 + LOAD_PACKED_16X2 17, 13, 15 ; in13 in15 + pxor m4, m4 + REPX {mova [cq+64*x], m4}, 0, 1, 2, 3 + cmp eobd, 107 + jb .fast + LOAD_PACKED_16X2 4, 16, 18 ; in16 in18 + LOAD_PACKED_16X2 5, 20, 22 ; in20 in22 + LOAD_PACKED_16X2 6, 24, 26 ; in24 in26 + LOAD_PACKED_16X2 7, 28, 30 ; in28 in30 + call m(idct_8x16_internal_8bpc).main + LOAD_PACKED_16X2 18, 19, 17 ; in19 in17 + LOAD_PACKED_16X2 19, 23, 21 ; in23 in21 + LOAD_PACKED_16X2 20, 27, 25 ; in27 in25 + LOAD_PACKED_16X2 21, 31, 29 ; in31 in29 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 4, 5, 6, 7 + call m(inv_txfm_add_dct_dct_8x32_8bpc).main + jmp .pass2 +.fast: ; bottom half is zero + mova ym5, ym4 + mova ym6, ym4 + mova ym7, ym4 + call m(idct_8x16_internal_8bpc).main + call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast +.pass2: + vpbroadcastd m10, [o(pw_8192)] + vpermt2q m0, m15, m4 ; t0 t1 t9 t8 + vpermt2q m20, m15, m18 ; t31 t30a t23a t22 + vpermt2q m3, m15, m7 ; t7 t6 t14 t15 + vpermt2q m12, m15, m21 ; t25 t24a t17a t16 + vpermt2q m2, m15, m6 ; t4 t5 t13 t12 + vpermt2q m14, m15, m13 ; t23a t22 t21a t20 + vpermt2q m1, m15, m5 ; t3 t2 t10 t11 + vpermt2q m19, m15, m16 ; t27 t26a t19a t18 + psubsw m8, m0, m20 ; out31 out30 out22 out23 + paddsw m0, m20 ; out0 out1 out9 out8 + paddsw m6, m3, m12 ; out7 out6 out14 out15 + psubsw m3, m12 ; out24 out25 out17 out16 + psubsw m5, m2, m14 ; out27 out26 out18 out19 + paddsw m4, m2, m14 ; out4 out5 out13 out12 + psubsw m7, m1, m19 ; out28 out29 out21 out20 + paddsw m2, m1, m19 ; out3 out2 out10 out11 + vzeroupper + vshufi32x4 m1, m0, m3, q1221 ; out1 out9 out17 out25 + vshufi32x4 m0, m3, q0330 ; out0 out8 out16 out24 + vshufi32x4 m3, m2, m5, q0330 ; out3 out11 out19 out27 + vshufi32x4 m2, m5, q1221 ; out2 out10 out18 out26 + vshufi32x4 m5, m4, m7, q1221 ; out5 out13 out21 out29 + vshufi32x4 m4, m7, q0330 ; out4 out12 out20 out28 + vshufi32x4 m7, m6, m8, q0330 ; out7 out15 out23 out31 + vshufi32x4 m6, m8, q1221 ; out6 out14 out22 out30 + REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8 + call .main + vpbroadcastd m8, [o(pw_2048)] + REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + lea r2, [strideq*3] + lea r3, [dstq+strideq*4] + movshdup m12, [o(permD)] + pmovzxbw m8, [dstq+strideq*0] + pmovzxbw m9, [dstq+strideq*1] + pmovzxbw m10, [dstq+strideq*2] + pmovzxbw m11, [dstq+r2 ] + 
paddw m0, m8 + paddw m1, m9 + paddw m2, m10 + paddw m3, m11 + pmovzxbw m8, [r3+strideq*0] + pmovzxbw m9, [r3+strideq*1] + pmovzxbw m10, [r3+strideq*2] + pmovzxbw m11, [r3+r2 ] + paddw m4, m8 + paddw m5, m9 + paddw m6, m10 + paddw m7, m11 + packuswb m0, m1 + packuswb m2, m3 + vpermq m0, m12, m0 + vpermq m2, m12, m2 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym2 + vextracti32x8 [dstq+r2 ], m2, 1 + packuswb m4, m5 + packuswb m6, m7 + vpermq m4, m12, m4 + vpermq m6, m12, m6 + mova [r3+strideq*0], ym4 + vextracti32x8 [r3+strideq*1], m4, 1 + mova [r3+strideq*2], ym6 + vextracti32x8 [r3+r2 ], m6, 1 + RET +.dconly: + movsx r6d, word [cq] + mov [cq], eobd + or r3d, 8 +.dconly2: + imul r6d, 181 + add r6d, 128+512 + sar r6d, 8+2 +.dconly3: + imul r6d, 181 + add r6d, 128+2048 + sar r6d, 8+4 + pxor m2, m2 + vpbroadcastw m3, r6d +.dconly_loop: + mova ym1, [dstq+strideq*0] + vinserti32x8 m1, [dstq+strideq*1], 1 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + paddw m0, m3 + paddw m1, m3 + packuswb m0, m1 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + RET +ALIGN function_align +cglobal_label .main + vpbroadcastd m10, [o(pd_2048)] +.main2: + ITX_MULSUB_2W 5, 3, 8, 9, 10, 3406, 2276 ; t5a, t6a + ITX_MULSUB_2W 1, 7, 8, 9, 10, 799, 4017 ; t4a, t7a + ITX_MULSUB_2W 2, 6, 8, 9, 10, 1567, 3784 ; t2, t3 + vpbroadcastd m11, [o(pw_2896_2896)] + vpbroadcastd m12, [o(pw_m2896_2896)] + ITX_MULSUB_2W 0, 4, 8, 9, 10, 11, 12 ; t1, t0 +.main3: + paddsw m8, m1, m5 ; t4 + psubsw m1, m5 ; t5a + paddsw m9, m7, m3 ; t7 + psubsw m7, m3 ; t6a + ITX_MULSUB_2W 7, 1, 3, 5, 10, 11, 12 ; t5, t6 + psubsw m5, m0, m2 ; dct4 out2 + paddsw m2, m0 ; dct4 out1 + paddsw m0, m4, m6 ; dct4 out0 + psubsw m4, m6 ; dct4 out3 + psubsw m6, m2, m1 ; out6 + paddsw m1, m2 ; out1 + paddsw m2, m5, m7 ; out2 + psubsw m5, m7 ; out5 + psubsw m7, m0, m9 ; out7 + paddsw m0, m9 ; out0 + paddsw m3, m4, m8 ; out3 + psubsw m4, m8 ; out4 + ret + +cglobal inv_txfm_add_identity_identity_8x32_8bpc, 3, 5, 0, dst, stride, c + vpbroadcastd m7, [pw_5] + paddsw m0, m7, [cq+64*0] + paddsw m1, m7, [cq+64*1] + vpbroadcastd ym9, strided + paddsw m2, m7, [cq+64*2] + paddsw m3, m7, [cq+64*3] + paddsw m4, m7, [cq+64*4] + paddsw m5, m7, [cq+64*5] + paddsw m6, m7, [cq+64*6] + paddsw m7, [cq+64*7] + pmulld ym14, ym9, [pd_0to15] + lea r3, [dstq+strideq*1] + lea r4, [dstq+strideq*2] + kxnorb k1, k1, k1 + pxor m13, m13 + add r1, r4 ; dstq+strideq*3 + kmovb k2, k1 + vpgatherdq m9{k1}, [r0+ym14*4] + kmovb k1, k2 + vpgatherdq m10{k2}, [r3+ym14*4] + kmovb k2, k1 + call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8 + REPX {psraw x, 3}, m0, m1, m2, m3, m4, m5, m6, m7 + vpgatherdq m11{k1}, [r4+ym14*4] + kmovb k1, k2 + vpgatherdq m12{k2}, [r1+ym14*4] + REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7 + punpcklbw m8, m9, m13 ; 0 8 16 24 + punpckhbw m9, m13 ; 4 12 20 28 + paddw m0, m8 + paddw m4, m9 + packuswb m0, m4 + kmovb k2, k1 + vpscatterdq [r0+ym14*4]{k1}, m0 + punpcklbw m8, m10, m13 ; 1 9 17 25 + punpckhbw m10, m13 ; 5 13 21 29 + paddw m1, m8 + paddw m5, m10 + packuswb m1, m5 + kmovb k1, k2 + vpscatterdq [r3+ym14*4]{k2}, m1 + punpcklbw m8, m11, m13 ; 2 10 18 26 + punpckhbw m11, m13 ; 6 14 22 30 + paddw m2, m8 + paddw m6, m11 + packuswb m2, m6 + kmovb k2, k1 + vpscatterdq [r4+ym14*4]{k1}, m2 + punpcklbw m8, m12, m13 ; 3 11 19 27 + punpckhbw m12, m13 ; 7 15 23 31 + paddw m3, m8 + paddw m7, m12 + packuswb m3, m7 + vpscatterdq [r1+ym14*4]{k2}, m3 + RET + 
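+; Sketch of the .dconly arithmetic above (constants as in the code; no new
+; behavior assumed): 181/256, like 2896/4096 elsewhere in this file,
+; approximates 1/sqrt(2), so each "imul 181 / add bias / sar shift" step is a
+; rounded 1/sqrt(2) scaling with the output rounding folded into the shift:
+;   .dconly2: dc = (dc*181 + 128 + 512)  >> (8+2)
+;   .dconly3: dc = (dc*181 + 128 + 2048) >> (8+4)
+; where each +128 rounds the *181 >> 8 scaling and the remaining bias/shift
+; applies the block-size-dependent output rounding. The resulting DC offset
+; is broadcast as a word (vpbroadcastw) and added to two unpacked pixel rows
+; per .dconly_loop iteration, with packuswb providing unsigned saturation.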
+cglobal inv_txfm_add_identity_identity_32x8_8bpc, 3, 5, 0, dst, stride, c + vpbroadcastd m0, [pw_4096] + pmulhrsw m3, m0, [cq+64*0] + pmulhrsw m4, m0, [cq+64*4] + pmulhrsw m6, m0, [cq+64*1] + pmulhrsw m5, m0, [cq+64*5] + pmulhrsw m7, m0, [cq+64*2] + pmulhrsw m2, m0, [cq+64*6] + pmulhrsw m8, m0, [cq+64*3] + pmulhrsw m0, [cq+64*7] + mova m13, [int8_permA] + lea r3, [strideq*3] + lea r4, [dstq+strideq*4] + punpckldq m1, m3, m4 + punpckhdq m3, m4 + punpckldq m4, m6, m5 + punpckhdq m6, m5 + punpckldq m5, m7, m2 + punpckhdq m7, m2 + punpckldq m2, m8, m0 + punpckhdq m8, m0 + mova ym9, [dstq+strideq*0] + vinserti32x8 m9, [dstq+strideq*2], 1 + mova ym10, [dstq+strideq*1] + vinserti32x8 m10, [dstq+r3 ], 1 + mova ym11, [r4+strideq*0] + vinserti32x8 m11, [r4+strideq*2], 1 + mova ym12, [r4+strideq*1] + vinserti32x8 m12, [r4+r3 ], 1 + REPX {vpermb x, m13, x}, m1, m4, m5, m2, m3, m6, m7, m8 + pxor m13, m13 + REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7 + punpcklqdq m0, m1, m4 ; a0 a2 c0 c2 + punpckhqdq m1, m4 ; b0 b2 d0 d2 + punpcklqdq m4, m5, m2 ; a1 a3 c1 c3 + punpckhqdq m5, m2 ; b1 b3 d1 d3 + punpcklqdq m2, m3, m6 ; e0 e2 g0 g2 + punpckhqdq m3, m6 ; f0 f2 h0 h2 + punpcklqdq m6, m7, m8 ; e1 e3 g1 g3 + punpckhqdq m7, m8 ; f1 f3 h1 h3 + punpcklbw m8, m9, m13 + punpckhbw m9, m13 + paddw m0, m8 + paddw m4, m9 + packuswb m0, m4 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*2], m0, 1 + punpcklbw m8, m10, m13 + punpckhbw m10, m13 + paddw m1, m8 + paddw m5, m10 + packuswb m1, m5 + mova [dstq+strideq*1], ym1 + vextracti32x8 [dstq+r3 ], m1, 1 + punpcklbw m8, m11, m13 + punpckhbw m11, m13 + paddw m2, m8 + paddw m6, m11 + packuswb m2, m6 + mova [r4+strideq*0], ym2 + vextracti32x8 [r4+strideq*2], m2, 1 + punpcklbw m8, m12, m13 + punpckhbw m12, m13 + paddw m3, m8 + paddw m7, m12 + packuswb m3, m7 + mova [r4+strideq*1], ym3 + vextracti32x8 [r4+r3 ], m3, 1 + RET + +%macro IDCT_16x32_END 3 ; src[1-2], row + mova xm8, [dstq+strideq*0] + vinserti32x4 ym8, [dstq+strideq*1], 1 + mova xm9, [dstq+r3 ] + vinserti32x4 ym9, [dstq+strideq*2], 1 + pmulhrsw m%1, m10 + pmulhrsw m%2, m10 + vpermb m8, m11, m8 + vpermb m9, m11, m9 + mova [cq+64*(%3*2+0)], m13 + mova [cq+64*(%3*2+1)], m13 + paddw m8, m%1 + paddw m9, m%2 + packuswb m8, m9 + vpermd m8, m12, m8 + mova [dstq+strideq*0], xm8 + vextracti32x4 [dstq+strideq*1], ym8, 1 + vextracti32x4 [dstq+strideq*2], m8, 2 + vextracti32x4 [dstq+r3 ], m8, 3 +%if %1 != 20 + lea dstq, [dstq+strideq*4] +%endif +%endmacro + +cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 22, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + vpbroadcastd m15, [o(pw_2896x8)] + cmp eobd, 151 + jb .fast + pmulhrsw m5, m15, [cq+64*10] + pmulhrsw m3, m15, [cq+64* 6] + pmulhrsw m1, m15, [cq+64* 2] + pmulhrsw m7, m15, [cq+64*14] + pmulhrsw m2, m15, [cq+64* 4] + pmulhrsw m6, m15, [cq+64*12] + pmulhrsw m0, m15, [cq+64* 0] + pmulhrsw m4, m15, [cq+64* 8] + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + pmulhrsw m14, m15, [cq+64* 1] + pmulhrsw m21, m15, [cq+64*15] + pmulhrsw m18, m15, [cq+64* 9] + pmulhrsw m17, m15, [cq+64* 7] + pmulhrsw m16, m15, [cq+64* 5] + pmulhrsw m19, m15, [cq+64*11] + pmulhrsw m20, m15, [cq+64*13] + pmulhrsw m15, [cq+64* 3] + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf + mova m8, [o(idct_16x32p)] + vpbroadcastd m9, [o(pw_16384)] + REPX {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m14, m15, m16, m17, m18, m19, m20, m21 + punpckldq m8, m0, m1 + punpckhdq m0, m1 + punpckldq m1, m2, m3 + punpckhdq m2, m3 + REPX {pmulhrsw x, m9}, m8, m0, 
m1, m2 + punpckldq m3, m4, m5 + punpckhdq m4, m5 + punpckldq m5, m6, m7 + punpckhdq m6, m7 + REPX {pmulhrsw x, m9}, m3, m4, m5, m6 + punpckldq m7, m14, m15 + punpckhdq m14, m15 + punpckldq m15, m16, m17 + punpckhdq m16, m17 + REPX {pmulhrsw x, m9}, m7, m14, m15, m16 + punpckldq m17, m18, m19 + punpckhdq m18, m19 + punpckldq m19, m20, m21 + punpckhdq m20, m21 + REPX {pmulhrsw x, m9}, m17, m18, m19, m20 + punpcklqdq m21, m8, m1 + punpckhqdq m8, m1 + punpcklqdq m1, m0, m2 + punpckhqdq m0, m2 + punpcklqdq m2, m3, m5 + punpckhqdq m3, m5 + punpcklqdq m5, m4, m6 + punpckhqdq m4, m6 + punpcklqdq m6, m7, m15 + punpckhqdq m7, m15 + punpcklqdq m15, m14, m16 + punpckhqdq m14, m16 + punpcklqdq m16, m17, m19 + punpckhqdq m17, m19 + punpcklqdq m19, m18, m20 + punpckhqdq m18, m20 + vinserti32x8 m20, m21, ym2, 1 + vshufi32x4 m21, m2, q3232 + vinserti32x8 m2, m8, ym3, 1 + vshufi32x4 m8, m3, q3232 + vinserti32x8 m3, m1, ym5, 1 + vshufi32x4 m1, m5, q3232 + vinserti32x8 m5, m0, ym4, 1 + vshufi32x4 m0, m4, q3232 + vinserti32x8 m4, m6, ym16, 1 + vshufi32x4 m6, m16, q3232 + vinserti32x8 m16, m7, ym17, 1 + vshufi32x4 m7, m17, q3232 + vinserti32x8 m17, m15, ym19, 1 + vshufi32x4 m15, m19, q3232 + vinserti32x8 m19, m14, ym18, 1 + vshufi32x4 m14, m18, q3232 + vshufi32x4 m18, m21, m6, q3131 ; 27 5 + vshufi32x4 m21, m6, q2020 ; 31 1 + vshufi32x4 m6, m8, m7, q2020 ; 24 8 + vshufi32x4 m8, m7, q3131 ; 30 2 + vshufi32x4 m7, m1, m15, q2020 ; 28 4 + vshufi32x4 m1, m15, q3131 ; 6 26 + vshufi32x4 m15, m0, m14, q2020 ; 7 25 + vshufi32x4 m0, m14, q3131 ; 14 18 + vshufi32x4 m14, m20, m4, q2020 ; 3 29 + vshufi32x4 m20, m4, q3131 ; 23 9 + vshufi32x4 m9, m3, m17, q2020 ; 16 0 + vshufi32x4 m3, m17, q3131 ; 12 20 + vshufi32x4 m17, m5, m19, q2020 ; 15 17 + vshufi32x4 m5, m19, q3131 ; 22 10 + vshufi32x4 m19, m2, m16, q2020 ; 19 13 + vshufi32x4 m16, m2, m16, q3131 ; 11 21 + call m(idct_16x16_internal_8bpc).main3 + call .main_oddhalf + jmp .pass2 +.fast: ; right half is zero + mova ym8, [cq+64*15] + vinserti32x8 m8, [cq+64* 1], 1 + mova m2, [o(int16_perm)] + mova ym9, [cq+64* 8] + vinserti32x8 m9, [cq+64* 0], 1 + mova ym0, [cq+64* 7] + vinserti32x8 m0, [cq+64* 9], 1 + mova ym7, [cq+64*14] + vinserti32x8 m7, [cq+64* 2], 1 + mova ym1, [cq+64* 3] + vinserti32x8 m1, [cq+64*13], 1 + mova ym3, [cq+64* 6] + vinserti32x8 m3, [cq+64*10], 1 + mova ym5, [cq+64*11] + vinserti32x8 m5, [cq+64* 5], 1 + mova ym6, [cq+64*12] + vinserti32x8 m6, [cq+64* 4], 1 + REPX {pmulhrsw x, m15}, m8, m9, m0, m7, m1, m3, m5, m6 + REPX {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6 + call m(idct_16x16_internal_8bpc).main2 + vbroadcasti32x4 m8, [o(int_shuf3)] + vbroadcasti32x4 m9, [o(int_shuf4)] + vpbroadcastd m11, [o(pw_16384)] + pshufb m0, m8 + pshufb m1, m9 + pshufb m2, m8 + pshufb m3, m9 + REPX {pmulhrsw x, m11}, m0, m1, m2, m3 + pshufb m4, m8 + pshufb m5, m9 + pshufb m6, m8 + pshufb m7, m9 + REPX {pmulhrsw x, m11}, m4, m5, m6, m7 + punpckhdq m17, m0, m1 + punpckldq m0, m1 + punpckhdq m16, m2, m3 + punpckldq m2, m3 + punpckhdq m18, m4, m5 + punpckldq m4, m5 + punpckhdq m5, m6, m7 + punpckldq m6, m7 + vinserti32x8 m1, m0, ym2, 1 + vshufi32x4 m3, m0, m2, q3232 + vinserti32x8 m2, m4, ym6, 1 + vshufi32x4 m4, m6, q3232 + vinserti32x8 m15, m17, ym16, 1 + vshufi32x4 m17, m16, q3232 + vinserti32x8 m16, m18, ym5, 1 + vshufi32x4 m18, m5, q3232 + vshufi32x4 m0, m1, m2, q2020 ; 0 2 + vshufi32x4 m1, m2, q3131 ; 4 6 + vshufi32x4 m2, m3, m4, q2020 ; 8 10 + vshufi32x4 m3, m4, q3131 ; 12 14 + vshufi32x4 m14, m15, m16, q2020 ; 1 3 + vshufi32x4 m15, m16, q3131 ; 5 7 + vshufi32x4 m16, 
m17, m18, q2020 ; 9 11 + vshufi32x4 m17, m18, q3131 ; 13 15 + pxor m6, m6 + punpckhwd m8, m0, m0 + punpcklwd m9, m6, m0 + punpckhwd m0, m3, m3 + punpckhwd m5, m2, m2 + punpcklwd m7, m1, m1 + punpckhwd m1, m1 + punpcklwd m3, m3 + punpcklwd m6, m2 + call m(idct_16x16_internal_8bpc).main_fast5 + punpcklwd m21, m14, m14 + punpckhwd m14, m14 + punpcklwd m18, m15, m15 + punpckhwd m15, m15 + punpcklwd m20, m16, m16 + punpckhwd m16, m16 + punpcklwd m19, m17, m17 + punpckhwd m17, m17 + call .main_oddhalf_fast +.pass2: + vpbroadcastd m10, [o(pw_2048)] + mova m11, [o(end_16x32p)] + lea r3, [strideq*3] + pxor m13, m13 + psrld m12, m11, 8 + IDCT_16x32_END 0, 1, 0 + IDCT_16x32_END 2, 3, 1 + IDCT_16x32_END 4, 5, 2 + IDCT_16x32_END 6, 7, 3 + IDCT_16x32_END 14, 15, 4 + IDCT_16x32_END 16, 17, 5 + IDCT_16x32_END 18, 19, 6 + IDCT_16x32_END 20, 21, 7 + RET +ALIGN function_align +.dconly: + movsx r6d, word [cq] + mov [cq], eobd + or r3d, 32 + jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly +ALIGN function_align +cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero + vpbroadcastd m8, [o(pw_201_4091x8)] + vpbroadcastd m20, [o(pw_m1380_3857x8)] + vpbroadcastd m9, [o(pw_995_3973x8)] + vpbroadcastd m16, [o(pw_m601_4052x8)] + pmulhrsw m21, m8 ; t16a, t31a + pmulhrsw m20, m15 ; t19a, t28a + pmulhrsw m18, m9 ; t20a, t27a + pmulhrsw m14, m16 ; t23a, t24a + mova m8, m21 + mova m17, m20 + mova m15, m18 + mova m16, m14 + jmp .main3 +ALIGN function_align +cglobal_label .main_oddhalf_fast ; bottom half is zero + vpbroadcastd m8, [o(pw_201_4091x8)] + vpbroadcastd m9, [o(pw_m2751_3035x8)] + vpbroadcastd m11, [o(pw_1751_3703x8)] + vpbroadcastd m12, [o(pw_m1380_3857x8)] + pmulhrsw m21, m8 ; t16a, t31a + vpbroadcastd m8, [o(pw_995_3973x8)] + pmulhrsw m17, m9 ; t17a, t30a + vpbroadcastd m9, [o(pw_m2106_3513x8)] + pmulhrsw m20, m11 ; t18a, t29a + vpbroadcastd m11, [o(pw_2440_3290x8)] + pmulhrsw m15, m12 ; t19a, t28a + vpbroadcastd m12, [o(pw_m601_4052x8)] + pmulhrsw m18, m8 ; t20a, t27a + pmulhrsw m16, m9 ; t21a, t26a + pmulhrsw m19, m11 ; t22a, t25a + pmulhrsw m14, m12 ; t23a, t24a + jmp .main2 +ALIGN function_align +cglobal_label .main_oddhalf + ITX_MUL2X_PACK 21, 8, 9, 10, 201, 4091, 5 ; t16a, t31a + ITX_MUL2X_PACK 17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a + ITX_MUL2X_PACK 20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a + ITX_MUL2X_PACK 15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a + ITX_MUL2X_PACK 18, 8, 9, 10, 995, 3973, 5 ; t20a, t27a + ITX_MUL2X_PACK 16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a + ITX_MUL2X_PACK 19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a + ITX_MUL2X_PACK 14, 8, 9, 10, 4052, 601, 5 ; t23a, t24a +.main2: + psubsw m8, m21, m17 ; t17 t30 + paddsw m21, m17 ; t16 t31 + psubsw m17, m15, m20 ; t18 t29 + paddsw m20, m15 ; t19 t28 + psubsw m15, m18, m16 ; t21 t26 + paddsw m18, m16 ; t20 t27 + psubsw m16, m14, m19 ; t22 t25 + paddsw m14, m19 ; t23 t24 +.main3: + ITX_MUL2X_PACK 8, 9, 19, 10, 799, 4017, 5 ; t17a t30a + ITX_MUL2X_PACK 17, 9, 19, 10, m4017, 799, 5 ; t18a t29a + ITX_MUL2X_PACK 15, 9, 19, 10, 3406, 2276, 5 ; t21a t26a + ITX_MUL2X_PACK 16, 9, 19, 10, m2276, 3406, 5 ; t22a t25a + vpbroadcastd m11, [o(pw_m3784_1567)] + psubsw m19, m21, m20 ; t19a t28a + paddsw m21, m20 ; t16a t31a + psubsw m20, m14, m18 ; t20a t27a + paddsw m14, m18 ; t23a t24a + psubsw m18, m8, m17 ; t18 t29 + paddsw m8, m17 ; t17 t30 + psubsw m17, m16, m15 ; t21 t26 + paddsw m15, m16 ; t22 t25 + ITX_MUL2X_PACK 18, 9, 16, 10, 1567_3784, 11, 20 ; t18a t29a + ITX_MUL2X_PACK 19, 9, 16, 10, 1567_3784, 11, 20 ; t19 t28 + ITX_MUL2X_PACK 20, 9, 16, 10, 11, 
m1567_m3784, 36 ; t20 t27 + ITX_MUL2X_PACK 17, 9, 16, 10, 11, m1567_m3784, 36 ; t21a t26a + vbroadcasti32x4 m9, [o(deint_shuf)] + psubsw m16, m21, m14 ; t23 t24 + paddsw m14, m21 ; t16 t31 + psubsw m21, m8, m15 ; t22a t25a + paddsw m15, m8 ; t17a t30a + psubsw m8, m18, m17 ; t21 t26 + paddsw m18, m17 ; t18 t29 + paddsw m17, m19, m20 ; t19a t28a + psubsw m19, m20 ; t20a t27a + vpbroadcastd m11, [o(pw_m2896_2896)] + vpbroadcastd m12, [o(pw_2896_2896)] + REPX {pshufb x, m9}, m14, m15, m18, m17 + mova m9, m10 + vpdpwssd m9, m16, m11 + mova m20, m10 + vpdpwssd m20, m21, m11 + psrad m9, 12 + psrad m20, 12 + packssdw m9, m20 ; t23a t22 + mova m20, m10 + vpdpwssd m20, m16, m12 + mova m16, m10 + vpdpwssd m16, m21, m12 + psrad m20, 12 + psrad m16, 12 + packssdw m16, m20, m16 ; t24a t25 + ITX_MUL2X_PACK 8, 21, 20, 10, 11, 12, 8 ; t21a t26a + ITX_MUL2X_PACK 19, 8, 11, 10, 11, 12, 8 ; t20 t27 + packssdw m11, m20 ; t27 t26a + packssdw m8, m21 ; t20 t21a + punpcklqdq m20, m14, m15 ; t16 t17a + punpckhqdq m14, m15 ; t31 t30a + punpckhqdq m15, m17, m18 ; t28a t29 + punpcklqdq m17, m18 ; t19a t18 + psubsw m21, m0, m14 ; out31 out30 + paddsw m0, m14 ; out0 out1 + psubsw m14, m7, m20 ; out16 out17 + paddsw m7, m20 ; out15 out14 + psubsw m20, m1, m15 ; out28 out29 + paddsw m1, m15 ; out3 out2 + psubsw m15, m6, m17 ; out19 out18 + paddsw m6, m17 ; out12 out13 + psubsw m17, m4, m9 ; out23 out22 + paddsw m4, m9 ; out8 out9 + psubsw m18, m3, m16 ; out24 out25 + paddsw m3, m16 ; out7 out6 + psubsw m16, m5, m8 ; out20 out21 + paddsw m5, m8 ; out11 out10 + psubsw m19, m2, m11 ; out27 out26 + paddsw m2, m11 ; out4 out5 + ret + +cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 22, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + mova m21, [o(permB)] + vpermq m1, m21, [cq+64* 0] ; 0 1 + vpermq m14, m21, [cq+64* 1] ; 2 3 + vpermq m20, m21, [cq+64* 2] ; 4 5 + vpermq m15, m21, [cq+64* 3] ; 6 7 + vpbroadcastd m8, [o(pw_2896x8)] + vpermq m2, m21, [cq+64* 4] ; 8 9 + vpermq m16, m21, [cq+64* 5] ; 10 11 + vpermq m3, m21, [cq+64* 6] ; 12 13 + vpermq m17, m21, [cq+64* 7] ; 14 15 + REPX {pmulhrsw x, m8}, m1, m14, m20, m15, m2, m16, m3, m17 + pxor m12, m12 + REPX {mova [cq+64*x], m12}, 0, 1, 2, 3, 4, 5, 6, 7 + cmp eobd, 151 + jb .fast + vpermq m9, m21, [cq+64* 8] ; 16 17 + vpermq m19, m21, [cq+64* 9] ; 18 19 + vpermq m4, m21, [cq+64*10] ; 20 21 + vpermq m5, m21, [cq+64*11] ; 22 23 + vpermq m6, m21, [cq+64*12] ; 24 25 + vpermq m18, m21, [cq+64*13] ; 26 27 + vpermq m7, m21, [cq+64*14] ; 28 29 + vpermq m21, m21, [cq+64*15] ; 30 31 + REPX {pmulhrsw x, m8}, m9, m19, m4, m5, m6, m18, m7, m21 + REPX {mova [cq+64*x], m12}, 8, 9, 10, 11, 12, 13, 14, 15 + punpcklwd m8, m21, m14 ; 30 2 + punpckhwd m21, m1 ; 31 1 + punpcklwd m0, m17, m19 ; 14 18 + punpckhwd m17, m9 ; 15 17 + punpcklwd m9, m1 ; 16 0 + punpckhwd m14, m7 ; 3 29 + punpcklwd m1, m15, m18 ; 6 26 + punpckhwd m15, m6 ; 7 25 + punpcklwd m6, m2 ; 24 8 + punpckhwd m19, m3 ; 19 13 + punpcklwd m3, m4 ; 12 20 + punpckhwd m18, m20 ; 27 5 + punpcklwd m7, m20 ; 28 4 + punpckhwd m20, m5, m2 ; 23 9 + punpcklwd m5, m16 ; 22 10 + punpckhwd m16, m4 ; 11 21 + call m(idct_16x16_internal_8bpc).main2 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf + jmp .pass2 +.fast: ; bottom half zero + punpcklwd m8, m14, m14 ; 2 + punpcklwd m0, m17, m17 ; 14 + punpcklwd m5, m16, m16 ; 10 + punpcklwd m9, m12, m1 ; __ 0 + punpckhwd m21, m1, m1 ; 1 + punpcklwd m1, m15, m15 ; 6 + punpcklwd m7, m20, m20 ; 4 + punpckhwd m19, m3, m3 ; 13 + punpcklwd m3, m3 ; 12 + punpcklwd m6, m12, 
m2 ; __ 8 + punpckhwd m18, m20, m20 ; 5 + punpckhwd m20, m2, m2 ; 9 + call m(idct_16x16_internal_8bpc).main_fast + punpckhwd m15, m15 ; 7 + punpckhwd m14, m14 ; 3 + punpckhwd m16, m16 ; 11 + punpckhwd m17, m17 ; 15 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast +.pass2: + vpbroadcastd m9, [o(pw_16384)] + call .transpose_round + vshufi32x4 m16, m14, m2, q3131 ; 5 + vshufi32x4 m14, m2, q2020 ; 1 + vshufi32x4 m2, m0, m3, q3131 ; 4 + vshufi32x4 m0, m3, q2020 ; 0 + vshufi32x4 m3, m1, m18, q3131 ; 6 + vshufi32x4 m1, m18, q2020 ; 2 + vshufi32x4 m18, m20, m6, q2020 ; 9 + vshufi32x4 m20, m6, q3131 ; 13 + vshufi32x4 m6, m21, m4, q3131 ; 12 + vshufi32x4 m4, m21, m4, q2020 ; 8 + vshufi32x4 m21, m19, m7, q3131 ; 15 + vshufi32x4 m19, m7, q2020 ; 11 + vshufi32x4 m7, m5, m15, q3131 ; 14 + vshufi32x4 m5, m15, q2020 ; 10 + vshufi32x4 m15, m17, m9, q2020 ; 3 + vshufi32x4 m17, m9, q3131 ; 7 + call m(inv_txfm_add_dct_dct_32x8_8bpc).main2 + call .main_oddhalf + vpbroadcastd m12, [o(pw_2048)] + movshdup m13, [o(permD)] + lea r2, [strideq*3] + pmovzxbw m8, [dstq+strideq*0] + pmovzxbw m9, [dstq+strideq*1] + pmovzxbw m10, [dstq+strideq*2] + pmovzxbw m11, [dstq+r2 ] + REPX {pmulhrsw x, m12}, m0, m1, m2, m3 + lea r3, [dstq+strideq*4] + paddw m0, m8 + paddw m1, m9 + paddw m2, m10 + paddw m3, m11 + pmovzxbw m8, [r3+strideq*0] + pmovzxbw m9, [r3+strideq*1] + pmovzxbw m10, [r3+strideq*2] + pmovzxbw m11, [r3+r2 ] + REPX {pmulhrsw x, m12}, m4, m5, m6, m7 + lea r4, [dstq+strideq*8] + packuswb m0, m1 + paddw m4, m8 + paddw m5, m9 + packuswb m2, m3 + paddw m6, m10 + paddw m7, m11 + pmovzxbw m8, [r4+strideq*0] + pmovzxbw m9, [r4+strideq*1] + pmovzxbw m10, [r4+strideq*2] + pmovzxbw m11, [r4+r2 ] + REPX {pmulhrsw x, m12}, m14, m15, m16, m17 + lea r5, [r3+strideq*8] + packuswb m4, m5 + paddw m14, m8 + paddw m15, m9 + packuswb m6, m7 + paddw m16, m10 + paddw m17, m11 + pmovzxbw m8, [r5+strideq*0] + pmovzxbw m9, [r5+strideq*1] + pmovzxbw m10, [r5+strideq*2] + pmovzxbw m11, [r5+r2 ] + REPX {pmulhrsw x, m12}, m18, m19, m20, m21 + packuswb m14, m15 + paddw m18, m8 + paddw m19, m9 + packuswb m16, m17 + paddw m20, m10 + paddw m21, m11 + packuswb m18, m19 + packuswb m20, m21 + REPX {vpermq x, m13, x}, m0, m2, m4, m6, m14, m16, m18, m20 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym2 + vextracti32x8 [dstq+r2 ], m2, 1 + mova [r3+strideq*0], ym4 + vextracti32x8 [r3+strideq*1], m4, 1 + mova [r3+strideq*2], ym6 + vextracti32x8 [r3+r2 ], m6, 1 + mova [r4+strideq*0], ym14 + vextracti32x8 [r4+strideq*1], m14, 1 + mova [r4+strideq*2], ym16 + vextracti32x8 [r4+r2 ], m16, 1 + mova [r5+strideq*0], ym18 + vextracti32x8 [r5+strideq*1], m18, 1 + mova [r5+strideq*2], ym20 + vextracti32x8 [r5+r2 ], m20, 1 + RET +ALIGN function_align +.dconly: + movsx r6d, word [cq] + mov [cq], eobd + or r3d, 16 + imul r6d, 181 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 128+256 + sar r6d, 8+1 + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3 +ALIGN function_align +cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero + vpbroadcastd m9, [o(pw_2896x8)] + vpbroadcastd m2, [o(pw_4017x8)] + vpbroadcastd m3, [o(pw_799x8)] + vpbroadcastd m18, [o(pw_4076x8)] + vpbroadcastd m19, [o(pw_401x8)] + vpbroadcastd m20, [o(pw_m1189x8)] + vpbroadcastd m16, [o(pw_3920x8)] + pmulhrsw m9, m0 ; t0 + pmulhrsw m2, m1 ; t7a + pmulhrsw m1, m3 ; t4a + pmulhrsw m18, m14 ; t15a + pmulhrsw m14, m19 ; t8a + pmulhrsw m20, m15 ; t11a + pmulhrsw m15, m16 ; t12a + psubsw m7, m9, m2 ; idct8 out7 + paddsw m0, m9, m2 ; idct8 
out0 + psubsw m4, m9, m1 ; idct8 out4 + paddsw m3, m9, m1 ; idct8 out3 + ITX_MULSUB_2W 2, 1, 5, 6, 10, 2896, 2896 ; t5, t6 + mova m21, m18 + mova m19, m14 + mova m16, m15 + mova m8, m20 + psubsw m6, m9, m1 ; idct8 out6 + paddsw m1, m9 ; idct8 out1 + psubsw m5, m9, m2 ; idct8 out5 + paddsw m2, m9 ; idct8 out2 + jmp .main3 +ALIGN function_align +cglobal_label .main_oddhalf_fast ; bottom half is zero + vpbroadcastd m5, [o(pw_m2276x8)] + vpbroadcastd m11, [o(pw_3406x8)] + vpbroadcastd m7, [o(pw_4017x8)] + vpbroadcastd m12, [o(pw_799x8)] + vpbroadcastd m6, [o(pw_3784x8)] + vpbroadcastd m10, [o(pw_1567x8)] + vpbroadcastd m4, [o(pw_2896x8)] + pmulhrsw m5, m3 ; t5a + pmulhrsw m3, m11 ; t6a + pmulhrsw m7, m1 ; t7a + pmulhrsw m1, m12 ; t4a + pmulhrsw m6, m2 ; t3 + pmulhrsw m2, m10 ; t2 + pmulhrsw m4, m0 ; t0 + vpbroadcastd m11, [o(pw_2896_2896)] + vpbroadcastd m12, [o(pw_m2896_2896)] + vpbroadcastd m10, [o(pd_2048)] + mova m0, m4 ; t1 + call m(inv_txfm_add_dct_dct_32x8_8bpc).main3 + vpbroadcastd m21, [o(pw_4076x8)] + vpbroadcastd m8, [o(pw_401x8)] + vpbroadcastd m18, [o(pw_m2598x8)] + vpbroadcastd m9, [o(pw_3166x8)] + vpbroadcastd m19, [o(pw_3612x8)] + vpbroadcastd m11, [o(pw_1931x8)] + vpbroadcastd m20, [o(pw_m1189x8)] + vpbroadcastd m12, [o(pw_3920x8)] + pmulhrsw m21, m14 ; t15a + pmulhrsw m14, m8 ; t8a + pmulhrsw m18, m17 ; t9a + pmulhrsw m17, m9 ; t14a + pmulhrsw m19, m16 ; t13a + pmulhrsw m16, m11 ; t10a + pmulhrsw m20, m15 ; t11a + pmulhrsw m15, m12 ; t12a + jmp .main2 +ALIGN function_align +cglobal_label .main_oddhalf + ITX_MULSUB_2W 14, 21, 8, 9, 10, 401, 4076 ; t8a, t15a + ITX_MULSUB_2W 18, 17, 8, 9, 10, 3166, 2598 ; t9a, t14a + ITX_MULSUB_2W 16, 19, 8, 9, 10, 1931, 3612 ; t10a, t13a + ITX_MULSUB_2W 20, 15, 8, 9, 10, 3920, 1189 ; t11a, t12a +.main2: + paddsw m8, m20, m16 ; t11 + psubsw m20, m16 ; t10 + paddsw m16, m15, m19 ; t12 + psubsw m15, m19 ; t13 + psubsw m19, m14, m18 ; t9 + paddsw m14, m18 ; t8 + psubsw m18, m21, m17 ; t14 + paddsw m21, m17 ; t15 +.main3: + vpbroadcastd m11, [o(pw_1567_3784)] + vpbroadcastd m12, [o(pw_m3784_1567)] + ITX_MULSUB_2W 18, 19, 9, 17, 10, 11, 12 ; t9a, t14a + vpbroadcastd m11, [o(pw_m1567_m3784)] + ITX_MULSUB_2W 15, 20, 9, 17, 10, 12, 11 ; t10a, t13a + vpbroadcastd m11, [o(pw_2896_2896)] + vpbroadcastd m12, [o(pw_m2896_2896)] + psubsw m17, m14, m8 ; t11a + paddsw m8, m14 ; t8a + paddsw m14, m18, m15 ; t9 + psubsw m18, m15 ; t10 + psubsw m15, m19, m20 ; t13 + paddsw m19, m20 ; t14 + paddsw m20, m21, m16 ; t15a + psubsw m16, m21, m16 ; t12a + ITX_MULSUB_2W 15, 18, 9, 21, 10, 11, 12 ; t10a, t13a + ITX_MULSUB_2W 16, 17, 9, 21, 10, 11, 12 ; t11, t12 + psubsw m21, m0, m20 ; out15 + paddsw m0, m20 ; out0 + psubsw m20, m1, m19 ; out14 + paddsw m1, m19 ; out1 + psubsw m19, m2, m18 ; out13 + paddsw m2, m18 ; out2 + psubsw m18, m3, m17 ; out12 + paddsw m3, m17 ; out3 + psubsw m17, m4, m16 ; out11 + paddsw m4, m16 ; out4 + psubsw m16, m5, m15 ; out10 + paddsw m5, m15 ; out5 + psubsw m15, m6, m14 ; out9 + paddsw m6, m14 ; out6 + psubsw m14, m7, m8 ; out8 + paddsw m7, m8 ; out7 + ret +.transpose_round: + punpcklwd m8, m0, m2 + punpckhwd m0, m2 + punpcklwd m2, m1, m3 + punpckhwd m1, m3 + punpcklwd m3, m4, m6 + punpckhwd m4, m6 + punpcklwd m6, m5, m7 + punpckhwd m5, m7 + punpcklwd m7, m14, m16 + punpckhwd m14, m16 + punpcklwd m16, m15, m17 + punpckhwd m15, m17 + punpcklwd m17, m19, m21 + punpckhwd m19, m21 + punpckhwd m21, m18, m20 + punpcklwd m18, m20 + punpcklwd m20, m8, m1 + punpckhwd m8, m1 + punpcklwd m1, m0, m2 + punpckhwd m0, m2 + punpcklwd m2, m3, m5 + punpckhwd 
m3, m5 + punpcklwd m5, m4, m6 + punpckhwd m4, m6 + REPX {pmulhrsw x, m9}, m20, m8, m1, m0 + punpcklwd m6, m7, m15 + punpckhwd m7, m15 + punpcklwd m15, m14, m16 + punpckhwd m14, m16 + REPX {pmulhrsw x, m9}, m2, m3, m5, m4 + punpckhwd m16, m18, m19 + punpcklwd m18, m19 + punpcklwd m19, m21, m17 + punpckhwd m21, m17 + REPX {pmulhrsw x, m9}, m6, m7, m15, m14 + punpcklwd m17, m8, m0 ; a2 a6 aa ae + punpckhwd m8, m0 ; a3 a7 ab af + punpcklwd m0, m20, m1 ; a0 a4 a8 ac + punpckhwd m20, m1 ; a1 a5 a9 ad + REPX {pmulhrsw x, m9}, m16, m18, m19, m21 + punpcklwd m1, m2, m5 ; b0 b4 b8 bc + punpckhwd m2, m5 ; b1 b5 b9 bd + punpcklwd m5, m3, m4 ; b2 b6 ba be + punpckhwd m3, m4 ; b3 b7 bb bf + punpcklwd m4, m6, m15 ; c0 c4 c8 cc + punpckhwd m6, m15 ; c1 c5 c9 cd + punpcklwd m15, m7, m14 ; c2 c6 ca ce + punpckhwd m7, m14 ; c3 c7 cb cf + punpcklwd m14, m18, m19 ; d0 d4 d8 dc + punpckhwd m18, m19 ; d1 d5 d9 dd + punpcklwd m9, m16, m21 ; d2 d6 da de + punpckhwd m16, m21 ; d3 d7 db df + vshufi32x4 m21, m0, m1, q3232 ; a8 ac b8 bc + vinserti32x8 m0, ym1, 1 ; a0 a4 b0 b4 + vinserti32x8 m1, m17, ym5, 1 ; a2 a6 b2 b6 + vshufi32x4 m5, m17, m5, q3232 ; aa ae ba be + vinserti32x8 m17, m8, ym3, 1 ; a3 a7 b3 b7 + vshufi32x4 m19, m8, m3, q3232 ; ab af bb bf + vinserti32x8 m3, m4, ym14, 1 ; c0 c4 d0 d4 + vshufi32x4 m4, m14, q3232 ; c8 cc d8 dc + vinserti32x8 m14, m20, ym2, 1 ; a1 a5 b1 b5 + vshufi32x4 m20, m2, q3232 ; a9 ad b9 bd + vinserti32x8 m2, m6, ym18, 1 ; c1 c5 d1 d5 + vshufi32x4 m6, m18, q3232 ; c9 cd d9 dd + vinserti32x8 m18, m15, ym9, 1 ; c2 c6 d2 d6 + vshufi32x4 m15, m9, q3232 ; ca ce da de + vinserti32x8 m9, m7, ym16, 1 ; c3 c7 d3 d7 + vshufi32x4 m7, m16, q3232 ; cb cf db df + ret + +%macro IDTX_16x32 4 ; src/dst[1-4] + pmulhrsw m%1, m15, [cq+64*%1] + pmulhrsw m%2, m15, [cq+64*%2] + pmulhrsw m%3, m15, [cq+64*%3] + pmulhrsw m%4, m15, [cq+64*%4] + pmulhrsw m18, m16, m%1 + pmulhrsw m19, m16, m%2 + pmulhrsw m20, m16, m%3 + pmulhrsw m21, m16, m%4 + REPX {pmulhrsw x, m17}, m18, m19, m20, m21 + paddsw m%1, m18 + paddsw m%2, m19 + paddsw m%3, m20 + paddsw m%4, m21 +%endmacro + +%macro IDTX_16x32_STORE 2 ; src[1-2] + mova xm17, [dstq+r3*0] + vinserti128 ym17, [dstq+r3*4], 1 + vinserti32x4 m17, [dstq+r3*8], 2 + vinserti32x4 m17, [dstq+r4*8], 3 + mova [cq+64*(%1*2+0)], m18 + mova [cq+64*(%1*2+1)], m18 + punpcklbw m16, m17, m18 + punpckhbw m17, m18 + paddw m16, m%1 + paddw m17, m%2 + packuswb m16, m17 + mova [dstq+r3*0], xm16 + vextracti128 [dstq+r3*4], ym16, 1 + vextracti32x4 [dstq+r3*8], m16, 2 + vextracti32x4 [dstq+r4*8], m16, 3 +%if %1 != 7 + add dstq, strideq +%endif +%endmacro + +cglobal inv_txfm_add_identity_identity_16x32_8bpc, 3, 5, 22, dst, stride, c + vpbroadcastd m15, [pw_2896x8] + vpbroadcastd m16, [pw_1697x16] + vpbroadcastd m17, [pw_16384] + IDTX_16x32 0, 1, 2, 3 + IDTX_16x32 4, 5, 6, 7 + IDTX_16x32 8, 9, 10, 11 + IDTX_16x32 12, 13, 14, 15 + vpbroadcastd m16, [pw_8192] + call .transpose_2x8x8_round + lea r3, [strideq*2] + lea r4, [strideq*3] + pxor m18, m18 + IDTX_16x32_STORE 0, 8 + IDTX_16x32_STORE 1, 9 + IDTX_16x32_STORE 2, 10 + IDTX_16x32_STORE 3, 11 + IDTX_16x32_STORE 4, 12 + IDTX_16x32_STORE 5, 13 + IDTX_16x32_STORE 6, 14 + IDTX_16x32_STORE 7, 15 + RET +ALIGN function_align +.transpose_2x8x8_round: + punpckhwd m17, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m6, m7 + punpcklwd m6, m7 + punpckhwd m7, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + punpckhdq m6, m5, m7 + punpckldq m5, m7 + punpckldq 
m7, m17, m1 + punpckhdq m17, m1 + REPX {pmulhrsw x, m16}, m0, m2, m3, m4, m5, m7, m6, m17 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m7 + punpckhqdq m5, m7 + punpckhqdq m7, m6, m17 + punpcklqdq m6, m17 + punpckhwd m17, m12, m13 + punpcklwd m12, m13 + punpckhwd m13, m8, m9 + punpcklwd m8, m9 + punpckhwd m9, m14, m15 + punpcklwd m14, m15 + punpckhwd m15, m10, m11 + punpcklwd m10, m11 + punpckhdq m11, m8, m10 + punpckldq m8, m10 + punpckldq m10, m12, m14 + punpckhdq m12, m14 + punpckhdq m14, m13, m15 + punpckldq m13, m15 + punpckldq m15, m17, m9 + punpckhdq m17, m9 + REPX {pmulhrsw x, m16}, m8, m10, m11, m12, m13, m15, m14, m17 + punpckhqdq m9, m8, m10 + punpcklqdq m8, m10 + punpcklqdq m10, m11, m12 + punpckhqdq m11, m12 + punpcklqdq m12, m13, m15 + punpckhqdq m13, m15 + punpckhqdq m15, m14, m17 + punpcklqdq m14, m17 + ret + +%macro IDTX_32x16 4 ; dst[1-4] + pmulhrsw m%2, m12, [cq+32*(%1+ 0)] + pmulhrsw m18, m12, [cq+32*(%1+16)] + pmulhrsw m%4, m12, [cq+32*(%3+ 0)] + pmulhrsw m19, m12, [cq+32*(%3+16)] + REPX {paddsw x, x}, m%2, m18, m%4, m19 + mova m%1, m14 + vpermi2q m%1, m%2, m18 + vpermt2q m%2, m16, m18 +%if %3 != 14 + mova m%3, m14 +%endif + vpermi2q m%3, m%4, m19 + vpermt2q m%4, m16, m19 + pmulhrsw m18, m17, m%1 + pmulhrsw m19, m17, m%2 + pmulhrsw m20, m17, m%3 + pmulhrsw m21, m17, m%4 + REPX {paddsw x, x}, m%1, m%2, m%3, m%4 + paddsw m%1, m18 + paddsw m%2, m19 + paddsw m%3, m20 + paddsw m%4, m21 +%endmacro + +%macro IDTX_32x16_STORE 2-3 0 ; src[1-2], 32x32 + mova ym19, [dstq+strideq*0] + vinserti32x8 m19, [dstq+strideq*8], 1 +%if %3 == 0 + mova [cq+64*(%1*2+0)], m20 + mova [cq+64*(%1*2+1)], m20 +%endif + punpcklbw m18, m19, m20 + punpckhbw m19, m20 + paddw m18, m%1 + paddw m19, m%2 + packuswb m18, m19 + mova [dstq+strideq*0], ym18 + vextracti32x8 [dstq+strideq*8], m18, 1 +%if %3 || %1 != 7 + add dstq, strideq +%endif +%endmacro + +cglobal inv_txfm_add_identity_identity_32x16_8bpc, 3, 3, 22, dst, stride, c + vpbroadcastd m12, [pw_2896x8] + movu m14, [permB+7] + vpbroadcastd m17, [pw_1697x16] + psrlq m16, m14, 4 + IDTX_32x16 0, 1, 2, 3 + IDTX_32x16 4, 5, 6, 7 + IDTX_32x16 8, 9, 10, 11 + IDTX_32x16 12, 13, 14, 15 + vpbroadcastd m16, [pw_2048] + call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round + pxor m20, m20 + IDTX_32x16_STORE 0, 8 + IDTX_32x16_STORE 1, 9 + IDTX_32x16_STORE 2, 10 + IDTX_32x16_STORE 3, 11 + IDTX_32x16_STORE 4, 12 + IDTX_32x16_STORE 5, 13 + IDTX_32x16_STORE 6, 14 + IDTX_32x16_STORE 7, 15 + RET + +%macro IDCT_32x32_END 4 ; src, mem, stride[1-2] + pmovzxbw m10, [dstq+%3] + pmovzxbw m11, [r3 +%4] +%if %2 < 8 + paddsw m8, m%2, m%1 + psubsw m9, m%2, m%1 +%else + mova m9, [cq+64*(%2*2-16)] + paddsw m8, m9, m%1 + psubsw m9, m%1 +%endif + pmulhrsw m8, m12 + pmulhrsw m9, m12 +%if %2 >= 8 +%if %2 == 8 + pxor m0, m0 +%endif + mova [cq+64*(%2*2-16)], m0 + mova [cq+64*(%2*2-15)], m0 +%endif + paddw m8, m10 + paddw m9, m11 + packuswb m8, m9 + vpermq m8, m13, m8 + mova [dstq+%3], ym8 + vextracti32x8 [r3 +%4], m8, 1 +%if %2 == 3 || %2 == 7 || %2 == 11 + add dstq, r5 + sub r3, r5 +%endif +%endmacro + +cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 0, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + WIN64_SPILL_XMM 30 + cmp eobd, 136 + jb .fast + mova m5, [cq+64*20] + mova m3, [cq+64*12] + mova m1, [cq+64* 4] + mova m7, [cq+64*28] + mova m2, [cq+64* 8] + mova m6, [cq+64*24] + mova m0, [cq+64* 0] + mova m4, [cq+64*16] + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + mova 
m14, [cq+64* 2] + mova m21, [cq+64*30] + mova m18, [cq+64*18] + mova m17, [cq+64*14] + mova m16, [cq+64*10] + mova m19, [cq+64*22] + mova m20, [cq+64*26] + mova m15, [cq+64* 6] + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf + mova [cq+64* 0], m14 + mova [cq+64* 2], m15 + mova [cq+64* 4], m16 + mova [cq+64* 6], m17 + mova [cq+64* 8], m18 + mova [cq+64*10], m19 + mova [cq+64*12], m20 + mova [cq+64*14], m21 + mova m22, [cq+64* 1] + mova m21, [cq+64*31] + mova m14, [cq+64*17] + mova m29, [cq+64*15] + mova m26, [cq+64* 9] + mova m17, [cq+64*23] + mova m18, [cq+64*25] + mova m25, [cq+64* 7] + mova m24, [cq+64* 5] + mova m19, [cq+64*27] + mova m16, [cq+64*21] + mova m27, [cq+64*11] + mova m28, [cq+64*13] + mova m15, [cq+64*19] + mova m20, [cq+64*29] + mova m23, [cq+64* 3] + call .main_oddhalf + vpbroadcastd m10, [o(pw_8192)] + psubsw m13, m0, m29 ; 31 + paddsw m0, m29 ; 0 + psubsw m29, m1, m28 ; 30 + paddsw m1, m28 ; 1 + psubsw m28, m2, m27 ; 29 + paddsw m2, m27 ; 2 + psubsw m27, m3, m26 ; 28 + paddsw m3, m26 ; 3 + psubsw m26, m4, m25 ; 27 + paddsw m4, m25 ; 4 + psubsw m25, m5, m24 ; 26 + paddsw m5, m24 ; 5 + psubsw m24, m6, m23 ; 25 + paddsw m6, m23 ; 6 + psubsw m23, m7, m22 ; 24 + paddsw m7, m22 ; 7 + pxor m9, m9 + punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7 + punpcklwd m0, m1 ; a0 b0 a1 b1 a2 b2 a3 b3 + punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7 + punpcklwd m2, m3 ; c0 d0 c1 d1 c2 d2 c3 d3 + REPX {mova [cq+64*x], m9}, 16, 17, 18, 19 + punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7 + punpcklwd m4, m5 ; e0 f0 e1 f1 e2 f2 e3 f3 + punpckhwd m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7 + punpcklwd m6, m7 ; g0 h0 g1 h1 g2 h2 g3 h3 + REPX {mova [cq+64*x], m9}, 20, 21, 22, 23 + punpckhwd m3, m23, m24 + punpcklwd m23, m24 + punpckhwd m24, m25, m26 + punpcklwd m25, m26 + REPX {mova [cq+64*x], m9}, 24, 25, 26, 27 + punpckhwd m26, m27, m28 + punpcklwd m27, m28 + punpckhwd m28, m29, m13 + punpcklwd m29, m13 + REPX {mova [cq+64*x], m9}, 28, 29, 30, 31 + punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpckldq m0, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3 + punpckldq m4, m6 ; e0 f0 g0 h0 e1 f1 g1 h1 + punpckhdq m6, m8, m1 ; a6 b6 c6 d6 a7 b7 c7 d7 + punpckldq m8, m1 ; a4 b4 c4 d4 a5 b5 c5 d5 + punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7 + punpckldq m22, m5 ; e4 f4 g4 h5 e5 f5 g5 h5 + REPX {pmulhrsw x, m10}, m0, m4, m8, m22 + punpckhdq m13, m23, m25 + punpckldq m23, m25 + punpckhdq m25, m27, m29 + punpckldq m27, m29 + REPX {pmulhrsw x, m10}, m13, m23, m25, m27 + punpckhdq m9, m3, m24 + punpckldq m3, m24 + punpckhdq m24, m26, m28 + punpckldq m26, m28 + punpcklqdq m5, m23, m27 ; d00 d08 d16 d24 + punpckhqdq m23, m27 ; d01 d09 d17 d25 + punpckhqdq m27, m13, m25 ; d03 d11 d19 d27 + punpcklqdq m13, m25 ; d02 d10 d18 d26 + punpckhqdq m25, m3, m26 ; d05 d13 d21 d29 + punpcklqdq m3, m26 ; d04 d12 d20 d28 + punpckhqdq m26, m9, m24 ; d07 d15 d23 d31 + punpcklqdq m9, m24 ; d06 d14 d22 d30 + REPX {pmulhrsw x, m10}, m25, m3, m26 + mova [cq+64* 9], m23 + mova [cq+64*11], m27 + mova [cq+64*13], m25 + mova [cq+64*15], m26 + punpckhqdq m24, m8, m22 ; a05 a13 a21 a29 + punpcklqdq m8, m22 ; a04 a12 a20 a28 + punpckhqdq m22, m0, m4 ; a01 a09 a17 a25 + punpcklqdq m0, m4 ; a00 a08 a16 a24 + punpckhqdq m23, m7, m2 ; a03 a11 a19 a27 + punpcklqdq m7, m2 ; a02 a10 a18 a26 + punpckhqdq m25, m6, m1 ; a07 a15 a23 a31 + punpcklqdq m6, m1 ; a06 a14 a22 a30 + mova m2, [cq+64* 0] + mova m11, [cq+64* 2] + mova m12, [cq+64* 4] + mova m29, [cq+64* 6] + mova m27, [cq+64* 8] + mova m26, [cq+64*10] 
+ mova m4, [cq+64*12] + mova m28, [cq+64*14] + psubsw m1, m2, m21 ; 23 + paddsw m2, m21 ; 8 + psubsw m21, m11, m20 ; 22 + paddsw m11, m20 ; 9 + psubsw m20, m12, m19 ; 21 + paddsw m12, m19 ; 10 + psubsw m19, m29, m18 ; 20 + paddsw m29, m18 ; 11 + psubsw m18, m27, m17 ; 19 + paddsw m27, m17 ; 12 + psubsw m17, m26, m16 ; 18 + paddsw m26, m16 ; 13 + paddsw m16, m4, m15 ; 14 + psubsw m4, m15 ; 17 + pmulhrsw m15, m6, m10 + psubsw m6, m28, m14 ; 16 + paddsw m28, m14 ; 15 + pmulhrsw m14, m7, m10 + punpcklwd m7, m6, m4 + punpckhwd m6, m4 + punpckhwd m4, m17, m18 + punpcklwd m17, m18 + punpckhwd m18, m19, m20 + punpcklwd m19, m20 + punpckhwd m20, m21, m1 + punpcklwd m21, m1 + punpckhwd m1, m2, m11 ; i4 j4 i5 j5 i6 j6 i7 j7 + punpcklwd m2, m11 ; i0 j1 i1 j1 i2 j2 i3 j3 + punpckhwd m11, m12, m29 ; k4 l4 k5 l5 k6 l6 k7 l7 + punpcklwd m12, m29 ; k0 l0 k1 l1 k2 l2 k3 l3 + punpckhwd m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7 + punpcklwd m27, m26 ; m0 n0 m1 n1 m2 n2 m3 n3 + punpckhwd m26, m16, m28 ; o4 p4 o5 p5 o6 p6 o7 p7 + punpcklwd m16, m28 ; o0 p0 o1 p1 o2 p2 o3 p3 + pmulhrsw m23, m10 + pmulhrsw m25, m10 + punpckhdq m28, m2, m12 ; i2 j2 k2 l2 i3 j3 k3 l3 + punpckldq m2, m12 ; i0 j0 k0 l0 i1 j1 k1 l1 + punpckhdq m12, m27, m16 ; m2 n2 o2 p2 m3 n3 o3 p3 + punpckldq m27, m16 ; m0 n0 o0 p0 m1 n1 o1 p1 + REPX {pmulhrsw x, m10}, m28, m2, m12, m27 + punpckhdq m16, m1, m11 ; i6 j6 k6 l6 i7 j7 k7 l7 + punpckldq m1, m11 ; i4 j4 k4 l4 i5 j5 k5 l5 + punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7 + punpckldq m29, m26 ; m4 n4 o4 p4 m5 n5 o5 p5 + REPX {pmulhrsw x, m10}, m16, m1, m11, m29 + punpckhdq m26, m19, m21 + punpckldq m19, m21 + punpckhdq m21, m6, m4 + punpckldq m6, m4 + REPX {pmulhrsw x, m10}, m26, m19, m21, m6 + punpckhdq m4, m18, m20 + punpckldq m18, m20 + punpckhdq m20, m7, m17 + punpckldq m7, m17 + REPX {pmulhrsw x, m10}, m4, m18, m20, m7 + punpcklqdq m17, m28, m12 ; b02 b10 b18 b26 + punpckhqdq m28, m12 ; b03 b11 b19 b27 + punpckhqdq m12, m2, m27 ; b01 b09 b17 b25 + punpcklqdq m2, m27 ; b00 b08 b16 b24 + punpckhqdq m27, m1, m29 ; b05 b13 b21 b29 + punpcklqdq m1, m29 ; b04 b12 b20 b28 + punpckhqdq m29, m16, m11 ; b07 b15 b23 b31 + punpcklqdq m16, m11 ; b06 b14 b22 b30 + mova [cq+64* 1], m12 + mova [cq+64* 3], m28 + mova [cq+64* 5], m27 + mova [cq+64* 7], m29 + punpckhqdq m27, m20, m26 ; c03 c11 c19 c27 + punpcklqdq m20, m26 ; c02 c10 c18 c26 + punpckhqdq m26, m7, m19 ; c01 c09 c17 c25 + punpcklqdq m7, m19 ; c00 c08 c16 c24 + punpckhqdq m28, m6, m18 ; c05 c13 c21 c29 + punpcklqdq m6, m18 ; c04 c12 c20 c28 + punpckhqdq m29, m21, m4 ; c07 c15 c23 c31 + punpcklqdq m21, m4 ; c06 c14 c22 c30 + pmulhrsw m19, m9, m10 + vshufi32x4 m4, m0, m2, q3232 ; a16 a24 b16 b24 + vinserti32x8 m0, ym2, 1 ; a00 a08 b00 b08 + vshufi32x4 m2, m7, m5, q3232 ; c16 c24 d16 d24 + vinserti32x8 m7, ym5, 1 ; c00 c08 d00 d08 + vshufi32x4 m5, m8, m1, q3232 ; a20 a28 b20 b28 + vinserti32x8 m1, m8, ym1, 1 ; a04 a12 b04 b12 + vshufi32x4 m8, m6, m3, q3232 ; c20 c28 d20 d28 + vinserti32x8 m6, ym3, 1 ; c04 c12 d04 d12 + vshufi32x4 m3, m1, m6, q3131 ; 12 + vshufi32x4 m1, m6, q2020 ; 4 + vshufi32x4 m6, m4, m2, q3131 ; 24 + vshufi32x4 m4, m2, q2020 ; 16 + vshufi32x4 m2, m0, m7, q3131 ; 8 + vshufi32x4 m0, m7, q2020 ; 0 + vshufi32x4 m7, m5, m8, q3131 ; 28 + vshufi32x4 m5, m8, q2020 ; 20 + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + vshufi32x4 m18, m14, m17, q3232 ; a18 a26 b18 b26 + vinserti32x8 m14, ym17, 1 ; a02 a10 b02 b10 + vshufi32x4 m17, m20, m13, q3232 ; c18 c26 d18 d26 + vinserti32x8 m20, ym13, 1 ; c02 c10 d02 d10 + vshufi32x4 m13, m21, m19, 
q3232 ; c22 c30 d22 d30 + vinserti32x8 m21, ym19, 1 ; c06 c14 d06 d14 + vshufi32x4 m19, m15, m16, q3232 ; a22 a30 b22 b30 + vinserti32x8 m15, ym16, 1 ; a06 a14 b06 b14 + vshufi32x4 m16, m14, m20, q3131 ; 10 + vshufi32x4 m14, m20, q2020 ; 2 + vshufi32x4 m20, m18, m17, q3131 ; 26 + vshufi32x4 m18, m17, q2020 ; 18 + vshufi32x4 m17, m15, m21, q3131 ; 14 + vshufi32x4 m15, m21, q2020 ; 6 + vshufi32x4 m21, m19, m13, q3131 ; 30 + vshufi32x4 m19, m13, q2020 ; 22 + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf + mova [cq+64* 0], m14 + mova [cq+64* 2], m15 + mova [cq+64* 4], m16 + mova [cq+64* 6], m17 + mova [cq+64* 8], m18 + mova [cq+64*10], m19 + mova [cq+64*12], m20 + mova [cq+64*14], m21 + mova m15, [cq+64* 1] + mova m16, [cq+64* 3] + mova m17, [cq+64* 5] + mova m19, [cq+64* 7] + mova m20, [cq+64* 9] + mova m21, [cq+64*11] + mova m13, [cq+64*13] + mova m18, [cq+64*15] + vshufi32x4 m14, m22, m15, q3232 ; a17 a25 b17 b25 + vinserti32x8 m22, ym15, 1 ; a01 a09 b01 b09 + vshufi32x4 m15, m23, m16, q3232 ; a19 a27 b19 b27 + vinserti32x8 m23, ym16, 1 ; a03 a11 b03 b11 + vshufi32x4 m16, m24, m17, q3232 ; a21 a29 b21 b29 + vinserti32x8 m24, ym17, 1 ; a05 a13 b05 b13 + vshufi32x4 m17, m25, m19, q3232 ; a23 a31 b23 b31 + vinserti32x8 m25, ym19, 1 ; a07 a15 b07 b15 + vinserti32x8 m8, m26, ym20, 1 ; c01 c09 d01 d09 + vshufi32x4 m26, m20, q3232 ; c17 c25 d17 d25 + vinserti32x8 m9, m27, ym21, 1 ; c03 c11 d03 d11 + vshufi32x4 m27, m21, q3232 ; c19 c27 d19 d27 + vinserti32x8 m11, m28, ym13, 1 ; c05 c13 d05 d13 + vshufi32x4 m28, m13, q3232 ; c21 c29 d21 d29 + vinserti32x8 m12, m29, ym18, 1 ; c07 c15 d07 d15 + vshufi32x4 m29, m18, q3232 ; c23 c31 d23 d31 + vshufi32x4 m18, m14, m26, q3131 ; 25 + vshufi32x4 m14, m26, q2020 ; 17 + vshufi32x4 m19, m15, m27, q3131 ; 27 + vshufi32x4 m15, m27, q2020 ; 19 + vshufi32x4 m20, m16, m28, q3131 ; 29 + vshufi32x4 m16, m28, q2020 ; 21 + vshufi32x4 m21, m17, m29, q3131 ; 31 + vshufi32x4 m17, m29, q2020 ; 23 + vshufi32x4 m26, m22, m8, q3131 ; 9 + vshufi32x4 m22, m8, q2020 ; 1 + vshufi32x4 m27, m23, m9, q3131 ; 11 + vshufi32x4 m23, m9, q2020 ; 3 + vshufi32x4 m28, m24, m11, q3131 ; 13 + vshufi32x4 m24, m11, q2020 ; 5 + vshufi32x4 m29, m25, m12, q3131 ; 15 + vshufi32x4 m25, m12, q2020 ; 7 + call .main_oddhalf + jmp .end +.fast: ; bottom/right halves are zero + mova m14, [o(dup16_perm)] + pmovzxwd m9, [cq+64* 0] + pmovzxwd m6, [cq+64* 8] + vpermb m8, m14, [cq+64* 2] + vpermb ym0, ym14, [cq+64*14] + vpermb ym5, ym14, [cq+64*10] + vpermb m1, m14, [cq+64* 6] + vpermb m7, m14, [cq+64* 4] + vpermb ym3, ym14, [cq+64*12] + pslld m9, 16 + pslld m6, 16 + call m(idct_16x16_internal_8bpc).main_fast + vpermb m21, m14, [cq+64* 1] + vpermb ym17, ym14, [cq+64*15] + vpermb ym20, ym14, [cq+64* 9] + vpermb m15, m14, [cq+64* 7] + vpermb m18, m14, [cq+64* 5] + vpermb ym16, ym14, [cq+64*11] + vpermb ym19, ym14, [cq+64*13] + vpermb m14, m14, [cq+64* 3] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + vpbroadcastd m9, [o(pw_8192)] + call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round + vshufi32x4 m22, m14, m2, q2020 ; 1 + vshufi32x4 m24, m14, m2, q3131 ; 5 + vshufi32x4 m23, m17, m9, q2020 ; 3 + vshufi32x4 m25, m17, m9, q3131 ; 7 + vshufi32x4 m16, m5, m15, q2020 ; 10 + vshufi32x4 m17, m5, m15, q3131 ; 14 + vshufi32x4 m14, m1, m18, q2020 ; 2 + vshufi32x4 m15, m1, m18, q3131 ; 6 + vshufi32x4 m1, m0, m3, q3131 ; 4 + vshufi32x4 m0, m3, q2020 ; 0 + vshufi32x4 m3, m21, m4, q3131 ; 12 + vshufi32x4 m2, m21, m4, q2020 ; 8 + vshufi32x4 m26, m20, m6, q2020 ; 9 + vshufi32x4 m28, m20, m6, q3131 ; 13 
+    vshufi32x4   m27, m19, m7, q2020 ; 11
+    vshufi32x4   m29, m19, m7, q3131 ; 15
+    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+    mova [cq+64* 0], m14
+    mova [cq+64* 2], m15
+    mova [cq+64* 4], m16
+    mova [cq+64* 6], m17
+    mova [cq+64* 8], m18
+    mova [cq+64*10], m19
+    mova [cq+64*12], m20
+    mova [cq+64*14], m21
+    call .main_oddhalf_fast
+.end:
+    lea r4, [strideq*3]
+    vpbroadcastd m12, [o(pw_2048)]
+    movshdup m13, [o(permD)]
+    lea r3, [dstq+r4*8]
+    lea r5, [strideq+r4] ; stride*4
+    add r3, r5 ; dst+stride*28
+    IDCT_32x32_END 29,  0, strideq*0, r4
+    IDCT_32x32_END 28,  1, strideq*1, strideq*2
+    IDCT_32x32_END 27,  2, strideq*2, strideq*1
+    IDCT_32x32_END 26,  3, r4       , strideq*0
+    IDCT_32x32_END 25,  4, strideq*0, r4
+    IDCT_32x32_END 24,  5, strideq*1, strideq*2
+    IDCT_32x32_END 23,  6, strideq*2, strideq*1
+    IDCT_32x32_END 22,  7, r4       , strideq*0
+    IDCT_32x32_END 21,  8, strideq*0, r4
+    IDCT_32x32_END 20,  9, strideq*1, strideq*2
+    IDCT_32x32_END 19, 10, strideq*2, strideq*1
+    IDCT_32x32_END 18, 11, r4       , strideq*0
+    IDCT_32x32_END 17, 12, strideq*0, r4
+    IDCT_32x32_END 16, 13, strideq*1, strideq*2
+    IDCT_32x32_END 15, 14, strideq*2, strideq*1
+    IDCT_32x32_END 14, 15, r4       , strideq*0
+    RET
+.dconly:
+    movsx r6d, word [cq]
+    mov [cq], eobd
+    or r3d, 32
+    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly2
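+; The .main_oddhalf_fast* entry points below are eob-pruned variants of
+; .main_oddhalf: with the bottom half (or bottom three-quarters) of the
+; input rows known to be zero, each initial rotation collapses to a single
+; pmulhrsw against a packed cosine constant instead of a full ITX_MULSUB_2W.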
+ALIGN function_align
+cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero
+    vpbroadcastd m21, [o(pw_4091x8)]
+    vpbroadcastd m8, [o(pw_201x8)]
+    vpbroadcastd m18, [o(pw_m1380x8)]
+    vpbroadcastd m9, [o(pw_3857x8)]
+    vpbroadcastd m19, [o(pw_3973x8)]
+    vpbroadcastd m11, [o(pw_995x8)]
+    vpbroadcastd m28, [o(pw_m601x8)]
+    vpbroadcastd m12, [o(pw_4052x8)]
+    pmulhrsw m21, m22 ; t31a
+    pmulhrsw m22, m8  ; t16a
+    pmulhrsw m18, m25 ; t19a
+    pmulhrsw m25, m9  ; t28a
+    pmulhrsw m19, m24 ; t27a
+    pmulhrsw m24, m11 ; t20a
+    pmulhrsw m28, m23 ; t23a
+    pmulhrsw m23, m12 ; t24a
+    mova m15, m21
+    mova m8, m22
+    mova m14, m18
+    mova m27, m25
+    mova m29, m19
+    mova m26, m24
+    mova m16, m28
+    mova m20, m23
+    jmp .main3
+ALIGN function_align
+cglobal_label .main_oddhalf_fast ; bottom half is zero
+    vpbroadcastd m21, [o(pw_4091x8)]
+    vpbroadcastd m8, [o(pw_201x8)]
+    vpbroadcastd m14, [o(pw_m2751x8)]
+    vpbroadcastd m9, [o(pw_3035x8)]
+    vpbroadcastd m17, [o(pw_3703x8)]
+    vpbroadcastd m11, [o(pw_1751x8)]
+    vpbroadcastd m18, [o(pw_m1380x8)]
+    vpbroadcastd m12, [o(pw_3857x8)]
+    pmulhrsw m21, m22 ; t31a
+    vpbroadcastd m19, [o(pw_3973x8)]
+    pmulhrsw m22, m8  ; t16a
+    vpbroadcastd m8, [o(pw_995x8)]
+    pmulhrsw m14, m29 ; t30a
+    vpbroadcastd m16, [o(pw_m2106x8)]
+    pmulhrsw m29, m9  ; t17a
+    vpbroadcastd m9, [o(pw_3513x8)]
+    pmulhrsw m17, m26 ; t29a
+    vpbroadcastd m15, [o(pw_3290x8)]
+    pmulhrsw m26, m11 ; t18a
+    vpbroadcastd m11, [o(pw_2440x8)]
+    pmulhrsw m18, m25 ; t19a
+    vpbroadcastd m20, [o(pw_m601x8)]
+    pmulhrsw m25, m12 ; t28a
+    vpbroadcastd m12, [o(pw_4052x8)]
+    pmulhrsw m19, m24 ; t27a
+    pmulhrsw m24, m8  ; t20a
+    pmulhrsw m16, m27 ; t21a
+    pmulhrsw m27, m9  ; t26a
+    pmulhrsw m15, m28 ; t25a
+    pmulhrsw m28, m11 ; t22a
+    pmulhrsw m20, m23 ; t23a
+    pmulhrsw m23, m12 ; t24a
+    jmp .main2
+ALIGN function_align
+cglobal_label .main_oddhalf
+    ITX_MULSUB_2W 22, 21, 8, 9, 10,  201, 4091 ; t16a, t31a
+    ITX_MULSUB_2W 14, 29, 8, 9, 10, 3035, 2751 ; t17a, t30a
+    ITX_MULSUB_2W 26, 17, 8, 9, 10, 1751, 3703 ; t18a, t29a
+    ITX_MULSUB_2W 18, 25, 8, 9, 10, 3857, 1380 ; t19a, t28a
+    ITX_MULSUB_2W 24, 19, 8, 9, 10,  995, 3973 ; t20a, t27a
+    ITX_MULSUB_2W 16, 27, 8, 9, 10, 3513, 2106 ; t21a, t26a
+    ITX_MULSUB_2W 28, 15, 8, 9, 10, 2440, 3290 ; t22a, t25a
+    ITX_MULSUB_2W 20, 23, 8, 9, 10, 4052,  601 ; t23a, t24a
+.main2:
+    psubsw m8, m22, m14  ; t17
+    paddsw m22, m14      ; t16
+    paddsw m14, m18, m26 ; t19
+    psubsw m18, m26      ; t18
+    psubsw m26, m24, m16 ; t21
+    paddsw m24, m16      ; t20
+    psubsw m16, m20, m28 ; t22
+    paddsw m28, m20      ; t23
+    psubsw m20, m23, m15 ; t25
+    paddsw m23, m15      ; t24
+    psubsw m15, m21, m29 ; t30
+    paddsw m21, m29      ; t31
+    psubsw m29, m19, m27 ; t26
+    paddsw m19, m27      ; t27
+    paddsw m27, m25, m17 ; t28
+    psubsw m25, m17      ; t29
+.main3:
+    ITX_MULSUB_2W 15,  8, 9, 17, 10,   799, 4017 ; t17a, t30a
+    ITX_MULSUB_2W 25, 18, 9, 17, 10, m4017,  799 ; t18a, t29a
+    ITX_MULSUB_2W 29, 26, 9, 17, 10,  3406, 2276 ; t21a, t26a
+    ITX_MULSUB_2W 20, 16, 9, 17, 10, m2276, 3406 ; t22a, t25a
+    vpbroadcastd m12, [o(pw_m3784_1567)]
+    vpbroadcastd m11, [o(pw_1567_3784)]
+    psubsw m17, m21, m27 ; t28a
+    paddsw m21, m27      ; t31a
+    psubsw m27, m15, m25 ; t18
+    paddsw m15, m25      ; t17
+    psubsw m25, m20, m29 ; t21
+    paddsw m20, m29      ; t22
+    psubsw m29, m8, m18  ; t29
+    paddsw m8, m18       ; t30
+    psubsw m18, m22, m14 ; t19a
+    paddsw m22, m14      ; t16a
+    psubsw m14, m28, m24 ; t20a
+    paddsw m24, m28      ; t23a
+    paddsw m28, m16, m26 ; t25
+    psubsw m16, m26      ; t26
+    psubsw m26, m23, m19 ; t27a
+    paddsw m23, m19      ; t24a
+    ITX_MULSUB_2W 29, 27, 9, 19, 10, 11, 12 ; t18a, t29a
+    ITX_MULSUB_2W 17, 18, 9, 19, 10, 11, 12 ; t19,  t28
+    vpbroadcastd m11, [o(pw_m1567_m3784)]
+    ITX_MULSUB_2W 16, 25, 9, 19, 10, 12, 11 ; t21a, t26a
+    ITX_MULSUB_2W 26, 14, 9, 19, 10, 12, 11 ; t20,  t27
+    vpbroadcastd m12, [o(pw_m2896_2896)]
+    vpbroadcastd m11, [o(pw_2896_2896)]
+    psubsw m19, m27, m25 ; t26
+    paddsw m27, m25      ; t29
+    psubsw m25, m17, m26 ; t20a
+    paddsw m17, m26      ; t19a
+    paddsw m26, m18, m14 ; t28a
+    psubsw m18, m14      ; t27a
+    paddsw m14, m22, m24 ; t16
+    psubsw m22, m24      ; t23
+    psubsw m24, m29, m16 ; t21
+    paddsw m16, m29      ; t18
+    paddsw m29, m21, m23 ; t31
+    psubsw m21, m23      ; t24
+    psubsw m23, m15, m20 ; t22a
+    paddsw m15, m20      ; t17a
+    psubsw m20, m8, m28  ; t25a
+    paddsw m28, m8       ; t30a
+    ITX_MULSUB_2W 18, 25, 8, 9, 10, 11, 12 ; t20,  t27
+    ITX_MULSUB_2W 19, 24, 8, 9, 10, 11, 12 ; t21a, t26a
+    ITX_MULSUB_2W 21, 22, 8, 9, 10, 11, 12 ; t23a, t24a
+    ITX_MULSUB_2W 20, 23, 8, 9, 10, 11, 12 ; t22,  t25
+    ret
+
+%macro IDTX_32x32 2 ; dst[1-2]
+    vmovdqa32 ym%1, [cq+64*(%1+ 0)] ; force EVEX encoding, which
+    vmovdqa32 ym17, [cq+64*(%1+16)] ; reduces code size due to
+    vmovdqa32 ym%2, [cq+64*(%2+ 0)] ; compressed displacements
+    vmovdqa32 ym18, [cq+64*(%2+16)]
+    vpermt2q m%1, m21, m17
+    vpermt2q m%2, m21, m18
+%endmacro
+
+cglobal inv_txfm_add_identity_identity_32x32_8bpc, 3, 3, 22, dst, stride, c
+    movu m21, [permB+7]
+    vpbroadcastd m16, [pw_8192]
+    pxor m20, m20
+.loop:
+    IDTX_32x32  0,  1
+    IDTX_32x32  2,  3
+    IDTX_32x32  4,  5
+    IDTX_32x32  6,  7
+    IDTX_32x32  8,  9
+    IDTX_32x32 10, 11
+    IDTX_32x32 12, 13
+    IDTX_32x32 14, 15
+    call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round
+    IDTX_32x16_STORE 0,  8, 1
+    IDTX_32x16_STORE 1,  9, 1
+    IDTX_32x16_STORE 2, 10, 1
+    IDTX_32x16_STORE 3, 11, 1
+    IDTX_32x16_STORE 4, 12, 1
+    IDTX_32x16_STORE 5, 13, 1
+    IDTX_32x16_STORE 6, 14, 1
+    IDTX_32x16_STORE 7, 15, 1
+    lea dstq, [dstq+strideq*8]
+    btc cq, 5
+    jnc .loop
+    mov r0d, 8
+.zero_loop:
+    mova [cq+64*0], m20
+    mova [cq+64*1], m20
+    mova [cq+64*2], m20
+    mova [cq+64*3], m20
+    add cq, 64*4
+    dec r0d
+    jg .zero_loop
+    RET
+
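+; 16x64 inverse DCT: eob == 0 takes the DC-only path, and eob < 151 implies
+; the right half of the coefficient block is zero (.fast), so a reduced set
+; of loads and butterflies is performed.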
+cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 7, 0, dst, stride, c, eob
+%undef cmp
+    lea r5, [o_base]
+    test eobd, eobd
+    jz .dconly
+    WIN64_SPILL_XMM 30
+    cmp eobd, 151
+    jb .fast
+    mova m5, [cq+64*10]
+    mova m3, [cq+64* 6]
+    mova m1, [cq+64* 2]
+    mova m7, [cq+64*14]
+    mova m2, [cq+64* 4]
+    mova m6, [cq+64*12]
+    mova m0, [cq+64* 0]
+    mova m4, [cq+64* 8]
+    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+    mova m14, [cq+64* 1]
+    mova m21, [cq+64*15]
+    mova m18, [cq+64* 9]
+    mova m17, [cq+64* 7]
+    mova m16, [cq+64* 5]
+    mova m19, [cq+64*11]
+    mova m20, [cq+64*13]
+    mova m15, [cq+64* 3]
+    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+    vpbroadcastd m9, [o(pw_8192)]
+%macro TRANSPOSE_8x4_ROUND 4
+    punpckhwd m8, m%3, m%4  ; c4 d4 c5 d5 c6 d6 c7 d7
+    punpcklwd m%3, m%4      ; c0 d0 c1 d1 c2 d2 c3 d3
+    punpckhwd m%4, m%1, m%2 ; a4 b4 a5 b5 a6 b6 a7 b7
+    punpcklwd m%1, m%2      ; a0 b0 a1 b1 a2 b2 a3 b3
+    punpckhdq m%2, m%1, m%3 ; a2 b2 c2 d2 a3 b3 c3 d3
+    punpckldq m%1, m%3      ; a0 b0 c0 d0 a1 b1 c1 d1
+    punpckldq m%3, m%4, m8  ; a4 b4 c4 d4 a5 b5 c5 d5
+    punpckhdq m%4, m8       ; a6 b6 c6 d6 a7 b7 c7 d7
+    REPX {pmulhrsw x, m9}, m%2, m%1, m%3, m%4
+%endmacro
+    TRANSPOSE_8x4_ROUND  0,  1,  2,  3
+    TRANSPOSE_8x4_ROUND  4,  5,  6,  7
+    TRANSPOSE_8x4_ROUND 14, 15, 16, 17
+    TRANSPOSE_8x4_ROUND 18, 19, 20, 21
+    vinserti32x8 m26, m0, ym4, 1 ; a0 a4 b0 b4
+    vshufi32x4   m0, m4, q3232   ; a8 a12 b8 b12
+    vinserti32x8 m27, m1, ym5, 1 ; a1 a5 b1 b5
+    vshufi32x4   m1, m5, q3232   ; a9 a13 b9 b13
+    vinserti32x8 m28, m2, ym6, 1 ; a2 a6 b2 b6
+    vshufi32x4   m2, m6, q3232   ; a10 a14 b10 b14
+    vinserti32x8 m29, m3, ym7, 1 ; a3 a7 b3 b7
+    vshufi32x4   m8, m3, m7, q3232 ; a11 a15 b11 b15
+    vinserti32x8 m4, m14, ym18, 1  ; c0 c4 d0 d4
+    vshufi32x4   m14, m18, q3232   ; c8 c12 d8 d12
+    vinserti32x8 m5, m15, ym19, 1  ; c1 c5 d1 d5
+    vshufi32x4   m15, m19, q3232   ; c9 c13 d9 d13
+    vinserti32x8 m6, m16, ym20, 1  ; c2 c6 d2 d6
+    vshufi32x4   m16, m20, q3232   ; c10 c14 d10 d14
+    vinserti32x8 m7, m17, ym21, 1  ; c3 c7 d3 d7
+    vshufi32x4   m17, m21, q3232   ; c11 c15 d11 d15
+    vshufi32x4   m22, m26, m4, q2020 ; 0 1
+    vshufi32x4   m26, m4, q3131      ; 8 9
+    vshufi32x4   m23, m27, m5, q2020 ; 2 3
+    vshufi32x4   m27, m5, q3131      ; 10 11
+    vshufi32x4   m24, m28, m6, q2020 ; 4 5
+    vshufi32x4   m28, m6, q3131      ; 12 13
+    vshufi32x4   m25, m29, m7, q2020 ; 6 7
+    vshufi32x4   m29, m7, q3131      ; 14 15
+    vshufi32x4   m4, m0, m14, q2020  ; 16 17
+    vshufi32x4   m3, m0, m14, q3131  ; 24 25
+    vshufi32x4   m20, m1, m15, q2020 ; 18 19
+    vshufi32x4   m19, m1, m15, q3131 ; 26 27
+    vshufi32x4   m5, m2, m16, q2020  ; 20 21
+    vshufi32x4   m0, m2, m16, q3131  ; 28 29
+    vshufi32x4   m16, m8, m17, q2020 ; 22 23
+    vshufi32x4   m17, m8, m17, q3131 ; 30 31
+    pxor m6, m6
+    mova [cq+64* 0], m4
+    mova [cq+64* 2], m5
+    mova [cq+64* 4], m3
+    mova [cq+64* 6], m0
+    punpcklwd m8, m24, m24 ; 4
+    punpcklwd m0, m0       ; 28
+    punpcklwd m5, m5       ; 20
+    punpcklwd m1, m28, m28 ; 12
+    punpcklwd m7, m26, m26 ; 8
+    punpcklwd m3, m3       ; 24
+    punpcklwd m9, m6, m22  ; __ 0
+    punpcklwd m6, m4       ; __ 16
+    call m(idct_16x16_internal_8bpc).main_fast3
+    mova [cq+64* 1], m20
+    mova [cq+64* 3], m16
+    mova [cq+64* 5], m19
+    mova [cq+64* 7], m17
+    punpcklwd m21, m23, m23 ; 2
+    punpcklwd m17, m17      ; 30
+    punpcklwd m20, m20      ; 18
+    punpcklwd m15, m29, m29 ; 14
+    punpcklwd m18, m27, m27 ; 10
+    punpcklwd m16, m16      ; 22
+    punpcklwd m19, m19      ; 26
+    punpcklwd m14, m25, m25 ; 6
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+    mova [cq+64* 8], m14
+    mova [cq+64* 9], m15
+    mova [cq+64*10], m16
+    mova [cq+64*11], m17
+    mova [cq+64*12], m18
+    mova [cq+64*13], m19
+    mova [cq+64*14], m20
+    mova [cq+64*15], m21
+    mova m21, [cq+64* 7]
+    mova m14, [cq+64* 0]
+    mova m17, [cq+64* 3]
+    mova m18, [cq+64* 4]
+    mova m19, [cq+64* 5]
+    mova m16, [cq+64* 2]
+    mova m15, [cq+64* 1]
+    mova m20, [cq+64* 6]
+    REPX {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \
+                           m24, m19, m16, m27, m28, m15, m20, m23
+    call .main_oddhalf
+    jmp .end
+.fast: ; right half is zero
+    mova ym8, [cq+64*15]
+    vinserti32x8 m8, [cq+64* 1], 1
+    mova m2, [o(int16_perm)]
+    mova ym9, [cq+64* 8]
+    vinserti32x8 m9, [cq+64* 0], 1
+    mova ym0, [cq+64* 7]
+    vinserti32x8 m0, [cq+64* 9], 1
+    mova ym7, [cq+64*14]
+    vinserti32x8 m7, [cq+64* 2], 1
+    mova ym1, [cq+64* 3]
+    vinserti32x8 m1, [cq+64*13], 1
+    mova ym3, [cq+64* 6]
+    vinserti32x8 m3, [cq+64*10], 1
+    mova ym5, [cq+64*11]
+    vinserti32x8 m5, [cq+64* 5], 1
+    mova ym6, [cq+64*12]
+    vinserti32x8 m6, [cq+64* 4], 1
+    REPX {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6
+    call m(idct_16x16_internal_8bpc).main2
+    vbroadcasti32x4 m8, [o(int_shuf3)]
+    vbroadcasti32x4 m9, [o(int_shuf4)]
+    vpbroadcastd m11, [o(pw_8192)]
+    pshufb m0, m8
+    pshufb m1, m9
+    pshufb m2, m8
+    pshufb m3, m9
+    REPX {pmulhrsw x, m11}, m0, m1, m2, m3
+    pshufb m4, m8
+    pshufb m5, m9
+    pshufb m6, m8
+    pshufb m7, m9
+    REPX {pmulhrsw x, m11}, m4, m5, m6, m7
+    punpckhdq m28, m0, m1
+    punpckldq m0, m1
+    punpckhdq m27, m2, m3
+    punpckldq m2, m3
+    punpckhdq m22, m4, m5
+    punpckldq m4, m5
+    punpckhdq m23, m6, m7
+    punpckldq m6, m7
+    vinserti32x8 m14, m0, ym2, 1
+    vshufi32x4   m15, m0, m2, q3232
+    vinserti32x8 m2, m4, ym6, 1
+    vshufi32x4   m4, m6, q3232
+    vshufi32x4   m21, m14, m2, q2020 ; 0 2
+    vshufi32x4   m14, m2, q3131      ; 4 6
+    vshufi32x4   m18, m15, m4, q2020 ; 8 10
+    vshufi32x4   m15, m4, q3131      ; 12 14
+    pxor m9, m9
+    punpcklwd m8, m14, m14 ; 4
+    punpcklwd m1, m15, m15 ; 12
+    punpcklwd m7, m18, m18 ; 8
+    punpcklwd m9, m21      ; __ 0
+    call m(idct_16x16_internal_8bpc).main_fast4
+    punpckhwd m21, m21 ; 2
+    punpckhwd m15, m15 ; 14
+    punpckhwd m18, m18 ; 10
+    punpckhwd m14, m14 ; 6
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
+    vinserti32x8 m24, m28, ym27, 1
+    vshufi32x4   m28, m27, q3232
+    vinserti32x8 m27, m22, ym23, 1
+    vshufi32x4   m22, m23, q3232
+    vshufi32x4   m23, m24, m27, q2020 ; 1 3
+    vshufi32x4   m24, m27, q3131      ; 5 7
+    vshufi32x4   m27, m28, m22, q2020 ; 9 11
+    vshufi32x4   m28, m22, q3131      ; 13 15
+    punpcklwd m22, m23, m23 ; 1
+    punpckhwd m29, m28, m28 ; 15
+    punpcklwd m26, m27, m27 ; 9
+    punpckhwd m25, m24, m24 ; 7
+    mova [cq+64* 8], m14
+    mova [cq+64* 9], m15
+    mova [cq+64*10], m16
+    mova [cq+64*11], m17
+    punpcklwd m24, m24 ; 5
+    punpckhwd m27, m27 ; 11
+    punpcklwd m28, m28 ; 13
+    punpckhwd m23, m23 ; 3
+    mova [cq+64*12], m18
+    mova [cq+64*13], m19
+    mova [cq+64*14], m20
+    mova [cq+64*15], m21
+    call .main_oddhalf_fast
+.end:
+    imul r6, strideq, 60
+    mova m10, [o(end_16x32p)]
+    vpbroadcastd m11, [o(pw_2048)]
+    lea r3, [strideq*3]
+    pxor m12, m12
+    add r6, dstq ; dst+stride*60
+    psrldq m13, m10, 1
+    lea r4, [strideq+r3] ; stride*4
+%macro IDCT_16x64_END 3 ; idct32, idct64, tmp
+%if %1 & 1
+    %define %%s0 r3
+    %define %%s1 strideq*2
+    %define %%s2 strideq*1
+    %define %%s3 strideq*0
+%else
+    %define %%s0 strideq*0
+    %define %%s1 strideq*1
+    %define %%s2 strideq*2
+    %define %%s3 r3
+%if %1
+    add dstq, r4
+    sub r6, r4
+%endif
+%endif
+%if %1 < 8
+    pmulhrsw m8, m11, m%1
+    pmulhrsw m9, m11, m%2
+%else
+    mova m9, [cq+64*%1]
+    paddsw m8, m9, m%2 ; out  0+n,  1+n
+    psubsw m9, m%2     ; out 63-n, 62-n
+    pmulhrsw m8, m11
+    pmulhrsw m9, m11
+%endif
+    mova xm29, [dstq+%%s0]
+    vinserti128 ym29, [dstq+%%s1], 1
+    mova xm%3, [r6 +%%s3]
+    vinserti128 ym%3, [r6 +%%s2], 1
+    vpermb m29, m10, m29
+    vpermb m%3, m10, m%3
+    mova [cq+64*%1], m12
+    paddw m29, m8
+    paddw m%3, m9
+    packuswb m29, m%3
+    vpermd m29, m13, m29
+    mova [dstq+%%s0], xm29
+    vextracti128 [dstq+%%s1], ym29, 1
+    vextracti32x4 [r6 +%%s2], m29, 2
+    vextracti32x4 [r6 +%%s3], m29, 3
+%endmacro
+    IDCT_16x64_END  0, 29,  0
+    IDCT_16x64_END  1, 28, 28
+    IDCT_16x64_END  2, 27, 28
+    IDCT_16x64_END  3, 26, 28
+    IDCT_16x64_END  4, 25, 28
+    IDCT_16x64_END  5, 24, 28
+    IDCT_16x64_END  6, 23, 28
+    IDCT_16x64_END  7, 22, 28
+    IDCT_16x64_END  8, 21, 28
+    IDCT_16x64_END  9, 20, 28
+    IDCT_16x64_END 10, 19, 28
+    IDCT_16x64_END 11, 18, 28
+    IDCT_16x64_END 12, 17, 28
+    IDCT_16x64_END 13, 16, 28
+    IDCT_16x64_END 14, 15, 28
+    IDCT_16x64_END 15, 14, 28
+    RET
+.dconly:
+    movsx r6d, word [cq]
+    mov [cq], eobd
+    or r3d, 64
+    imul r6d, 181
+    add r6d, 128+512
+    sar r6d, 8+2
+    jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3
+ALIGN function_align
+cglobal_label .main_oddhalf_fast ; bottom three-quarters are zero
+    vpbroadcastd m8, [o(pw_101_4095x8)]
+    vpbroadcastd m21, [o(pw_m1474_3822x8)]
+    vpbroadcastd m14, [o(pw_897_3996x8)]
+    vpbroadcastd m17, [o(pw_m700_4036x8)]
+    vpbroadcastd m18, [o(pw_501_4065x8)]
+    vpbroadcastd m19, [o(pw_m1092_3948x8)]
+    vpbroadcastd m16, [o(pw_1285_3889x8)]
+    vpbroadcastd m15, [o(pw_m301_4085x8)]
+    pmulhrsw m8, m22  ; t32a t63a
+    pmulhrsw m21, m29 ; t35a t60a
+    pmulhrsw m14, m26 ; t36a t59a
+    pmulhrsw m17, m25 ; t39a t56a
+    pmulhrsw m18, m24 ; t40a t55a
+    pmulhrsw m19, m27 ; t43a t52a
+    pmulhrsw m16, m28 ; t44a t51a
+    pmulhrsw m15, m23 ; t47a t48a
+    mova m22, m8
+    mova m29, m21
+    mova m26, m14
+    mova m25, m17
+    mova m24, m18
+    mova m27, m19
+    mova m28, m16
+    mova m20, m15
+    jmp .main_oddhalf2
+ALIGN function_align
+cglobal_label .main_oddhalf
+    vpbroadcastd m8, [o(pw_101_4095x8)]
+    vpbroadcastd m9, [o(pw_m2824_2967x8)]
+    vpbroadcastd m11, [o(pw_1660_3745x8)]
+    vpbroadcastd m12, [o(pw_m1474_3822x8)]
+    pmulhrsw m22, m8  ; t32a t63a
+    vpbroadcastd m8, [o(pw_897_3996x8)]
+    pmulhrsw m21, m9  ; t33a t62a
+    vpbroadcastd m9, [o(pw_m2191_3461x8)]
+    pmulhrsw m14, m11 ; t34a t61a
+    vpbroadcastd m11, [o(pw_2359_3349x8)]
+    pmulhrsw m29, m12 ; t35a t60a
+    vpbroadcastd m12, [o(pw_m700_4036x8)]
+    pmulhrsw m26, m8  ; t36a t59a
+    vpbroadcastd m8, [o(pw_501_4065x8)]
+    pmulhrsw m17, m9  ; t37a t58a
+    vpbroadcastd m9, [o(pw_m2520_3229x8)]
+    pmulhrsw m18, m11 ; t38a t57a
+    vpbroadcastd m11, [o(pw_2019_3564x8)]
+    pmulhrsw m25, m12 ; t39a t56a
+    vpbroadcastd m12, [o(pw_m1092_3948x8)]
+    pmulhrsw m24, m8  ; t40a t55a
+    vpbroadcastd m8, [o(pw_1285_3889x8)]
+    pmulhrsw m19, m9  ; t41a t54a
+    vpbroadcastd m9, [o(pw_m1842_3659x8)]
+    pmulhrsw m16, m11 ; t42a t53a
+    vpbroadcastd m11, [o(pw_2675_3102x8)]
+    pmulhrsw m27, m12 ; t43a t52a
+    vpbroadcastd m12, [o(pw_m301_4085x8)]
+    pmulhrsw m28, m8  ; t44a t51a
+    pmulhrsw m15, m9  ; t45a t50a
+    pmulhrsw m20, m11 ; t46a t49a
+    pmulhrsw m23, m12 ; t47a t48a
+    psubsw m8, m22, m21  ; t33 t62
+    paddsw m22, m21      ; t32 t63
+    psubsw m21, m29, m14 ; t34 t61
+    paddsw m29, m14      ; t35 t60
+    psubsw m14, m26, m17 ; t37 t58
+    paddsw m26, m17      ; t36 t59
+    psubsw m17, m25, m18 ; t38 t57
+    paddsw m25, m18      ; t39 t56
+    psubsw m18, m24, m19 ; t41 t54
+    paddsw m24, m19      ; t40 t55
+    psubsw m19, m27, m16 ; t42 t53
+    paddsw m27, m16      ; t43 t52
+    psubsw m16, m28, m15 ; t45 t50
+    paddsw m28, m15      ; t44 t51
+    psubsw m15, m23, m20 ; t46 t49
+    paddsw m20, m23      ; t47 t48
+.main_oddhalf2:
+    ITX_MUL2X_PACK  8, 9, 23, 10,   401, 4076, 5 ; t33a t62a
+    ITX_MUL2X_PACK 21, 9, 23, 10, m4076,  401, 5 ; t34a t61a
+    ITX_MUL2X_PACK 14, 9, 23, 10,  3166, 2598, 5 ; t37a t58a
+    ITX_MUL2X_PACK 17, 9, 23, 10, m2598, 3166, 5 ; t38a t57a
+    ITX_MUL2X_PACK 18, 9, 23, 10,  1931, 3612, 5 ; t41a t54a
+    ITX_MUL2X_PACK 19, 9, 23, 10, m3612, 1931, 5 ; t42a t53a
+    ITX_MUL2X_PACK 16, 9, 23, 10,  3920, 1189, 5 ; t45a t50a
+    ITX_MUL2X_PACK 15, 9, 23, 10, m1189, 3920, 5 ; t46a t49a
+    vpbroadcastd m11, [o(pw_m4017_799)]
+    psubsw m23, m25, m26 ; t36a t59a
+    paddsw m25, m26      ; t39a t56a
+    psubsw m26, m24, m27 ; t43a t52a
+    paddsw m27, m24      ; t40a t55a
+    psubsw m24, m20, m28 ; t44a t51a
+    paddsw m20, m28      ; t47a t48a
+    psubsw m28, m8, m21  ; t34  t61
+    paddsw m8, m21       ; t33  t62
+    psubsw m21, m17, m14 ; t37  t58
+    paddsw m17, m14      ; t38  t57
+    psubsw m14, m18, m19 ; t42  t53
+    paddsw m18, m19      ; t41  t54
+    psubsw m19, m15, m16 ; t45  t50
+    paddsw m15, m16      ; t46  t49
+    psubsw m16, m22, m29 ; t35a t60a
+    paddsw m22, m29      ; t32a t63a
+    ITX_MUL2X_PACK 16, 9, 29, 10, 799_4017, 11, 20 ; t35  t60
+    ITX_MUL2X_PACK 28, 9, 29, 10, 799_4017, 11, 20 ; t34a t61a
+    ITX_MUL2X_PACK 23, 9, 29, 10, 11, m799_m4017, 36 ; t36  t59
+    ITX_MUL2X_PACK 21, 9, 29, 10, 11, m799_m4017, 36 ; t37a t58a
+    vpbroadcastd m11, [o(pw_m2276_3406)]
+    ITX_MUL2X_PACK 26, 9, 29, 10, 3406_2276, 11, 20 ; t43  t52
+    ITX_MUL2X_PACK 14, 9, 29, 10, 3406_2276, 11, 20 ; t42a t53a
+    ITX_MUL2X_PACK 24, 9, 29, 10, 11, m3406_m2276, 36 ; t44  t51
+    ITX_MUL2X_PACK 19, 9, 29, 10, 11, m3406_m2276, 36 ; t45a t50a
+    vpbroadcastd m11, [o(pw_1567_3784)]
+    vpbroadcastd m12, [o(pw_m3784_1567)]
+    psubsw m29, m22, m25 ; t39  t56
+    paddsw m22, m25      ; t32  t63
+    psubsw m25, m20, m27 ; t40  t55
+    paddsw m20, m27      ; t47  t48
+    psubsw m27, m8, m17  ; t38a t57a
+    paddsw m8, m17       ; t33a t62a
+    psubsw m17, m15, m18 ; t41a t54a
+    paddsw m15, m18      ; t46a t49a
+    paddsw m18, m16, m23 ; t35a t60a
+    psubsw m16, m23      ; t36a t59a
+    psubsw m23, m24, m26 ; t43a t52a
+    paddsw m24, m26      ; t44a t51a
+    paddsw m26, m28, m21 ; t34  t61
+    psubsw m28, m21      ; t37  t58
+    psubsw m21, m19, m14 ; t42  t53
+    paddsw m19, m14      ; t45  t50
+    ITX_MUL2X_PACK 29, 9, 14, 10, 11, 12, 4 ; t39a t56a
+    ITX_MUL2X_PACK 27, 9, 14, 10, 11, 12, 4 ; t38  t57
+    ITX_MUL2X_PACK 16, 9, 14, 10, 11, 12, 4 ; t36  t59
+    ITX_MUL2X_PACK 28, 9, 14, 10, 11, 12, 4 ; t37a t58a
+    vpbroadcastd m11, [o(pw_m1567_m3784)]
+    ITX_MUL2X_PACK 25, 9, 14, 10, 12, 11, 4 ; t40a t55a
+    ITX_MUL2X_PACK 17, 9, 14, 10, 12, 11, 4 ; t41  t54
+    ITX_MUL2X_PACK 23, 9, 14, 10, 12, 11, 4 ; t43  t52
+    ITX_MUL2X_PACK 21, 9, 14, 10, 12, 11, 4 ; t42a t53a
+    vbroadcasti32x4 m13, [o(deint_shuf)]
+    vpbroadcastd m11, [o(pw_2896_2896)]
+    vpbroadcastd m12, [o(pw_m2896_2896)]
+    paddsw m14, m22, m20 ; t32a t63a
+    psubsw m22, m20      ; t47a t48a
+    psubsw m20, m8, m15  ; t46  t49
+    paddsw m8, m15       ; t33  t62
+    paddsw m15, m18, m24 ; t35  t60
+    psubsw m18, m24      ; t44  t51
+    psubsw m24, m26, m19 ; t45a t50a
+    paddsw m26, m19      ; t34a t61a
+    REPX {pshufb x, m13}, m14, m8, m15, m26
+    psubsw m19, m29, m25 ; t40  t55
+    paddsw m25, m29      ; t39  t56
+    psubsw m29, m27, m17 ; t41a t54a
+    paddsw m27, m17      ; t38a t57a
+    psubsw m17, m16, m23 ; t43a t52a
+    paddsw m16, m23      ; t36a t59a
+    psubsw m9, m28, m21  ; t42  t53
+    paddsw m28, m21      ; t37  t58
+    REPX {pshufb x, m13}, m25, m27, m16, m28
+    ITX_MUL2X_PACK 22, 13, 21, 10, 11, 12, 8 ; t47  t48
+    ITX_MUL2X_PACK 20, 23, 22, 10, 11, 12, 8 ; t46a t49a
+    packssdw m21, m22 ; t47 t46a
+    packssdw m13, m23 ; t48 t49a
+    ITX_MUL2X_PACK 18, 22, 20, 10, 11, 12, 8 ; t44a t51a
+    ITX_MUL2X_PACK 24, 23, 18, 10, 11, 12, 8 ; t45  t50
+    packssdw m20, m18 ; t44a t45
+    packssdw m22, m23 ; t51a t50
+    ITX_MUL2X_PACK 19, 24, 18, 10, 11, 12, 8 ; t40a t55a
+    ITX_MUL2X_PACK 29, 23, 19, 10, 11, 12, 8 ; t41  t54
+    packssdw m18, m19 ; t40a t41
+    packssdw m24, m23 ; t55a t54
+    ITX_MUL2X_PACK 17, 23, 19, 10, 11, 12, 8 ; t43  t52
+    ITX_MUL2X_PACK  9, 29, 17, 10, 11, 12, 8 ; t42a t53a
+    packssdw m19, m17 ; t43  t42a
+    packssdw m23, m29 ; t52  t53a
+    punpcklqdq m17, m25, m27 ; t39  t38a
+    punpckhqdq m25, m27      ; t56  t57a
+    punpckhqdq m27, m15, m26 ; t60  t61a
+    punpcklqdq m15, m26      ; t35  t34a
+    punpckhqdq m26, m16, m28 ; t59a t58
+    punpcklqdq m16, m28      ; t36a t37
+    punpckhqdq m28, m14, m8  ; t63a t62
+    punpcklqdq m14, m8       ; t32a t33
+    psubsw m29, m0, m28 ; out63 out62
+    paddsw m0, m28      ; out0  out1
+    psubsw m28, m1, m27 ; out60 out61
+    paddsw m1, m27      ; out3  out2
+    psubsw m27, m2, m26 ; out59 out58
+    paddsw m2, m26      ; out4  out5
+    psubsw m26, m3, m25 ; out56 out57
+    paddsw m3, m25      ; out7  out6
+    psubsw m25, m4, m24 ; out55 out54
+    paddsw m4, m24      ; out8  out9
+    psubsw m24, m5, m23 ; out52 out53
+    paddsw m5, m23      ; out11 out10
+    psubsw m23, m6, m22 ; out51 out50
+    paddsw m6, m22      ; out12 out13
+    psubsw m22, m7, m13 ; out48 out49
+    paddsw m7, m13      ; out15 out14
+    ret
+
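+; 64x16 inverse DCT. In the .dconly chains below, (x*181 + 128) >> 8 is a
+; rounded multiply by 181/256 ~= 1/sqrt(2) (181 ~= 128*sqrt(2)); the widened
+; rounding constants and shifts fold the final output rounding into the
+; same multiplies.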
+cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 7, 0, dst, stride, c, eob
+%undef cmp
+    lea r5, [o_base]
+    test eobd, eobd
+    jnz .normal
+    movsx r6d, word [cq]
+    mov [cq], eobd
+    or r3d, 16
+.dconly:
+    imul r6d, 181
+    add r6d, 128+512
+    sar r6d, 8+2
+.dconly2:
+    imul r6d, 181
+    add r6d, 128+2048
+    sar r6d, 8+4
+    pxor m2, m2
+    vpbroadcastw m3, r6d
+.dconly_loop:
+    mova m1, [dstq]
+    punpcklbw m0, m1, m2
+    punpckhbw m1, m2
+    paddw m0, m3
+    paddw m1, m3
+    packuswb m0, m1
+    mova [dstq], m0
+    add dstq, strideq
+    dec r3d
+    jg .dconly_loop
+    RET
+.normal:
+    WIN64_SPILL_XMM 31
+    mova m19, [o(dup16_perm)]
+    mova m24, [cq+64* 2]
+    mova m28, [cq+64* 6]
+    mova m26, [cq+64* 4]
+    mova m22, [cq+64* 0]
+    mova m23, [cq+64* 1]
+    mova m29, [cq+64* 7]
+    mova m27, [cq+64* 5]
+    mova m25, [cq+64* 3]
+    vpermb m8, m19, m24  ; 4
+    vpermb m1, m19, m28  ; 12
+    vpermb m7, m19, m26  ; 8
+    vpermb m9, m19, m22  ; __ 0
+    vpermb m21, m19, m23 ; 2
+    vpermb m15, m19, m29 ; 14
+    vpermb m18, m19, m27 ; 10
+    vpermb m14, m19, m25 ; 6
+    pslld m9, 16
+    vpord m30, m19, [o(pb_32)] {1to16}
+    REPX {vpermb x, m30, x}, m22, m29, m26, m25, m24, m27, m28, m23
+    cmp eobd, 151
+    jb .fast
+    vpermb m0, m19, [cq+64*14] ; 28
+    vpermb m5, m19, [cq+64*10] ; 20
+    vpermb m3, m19, [cq+64*12] ; 24
+    vpermb m6, m19, [cq+64* 8] ; __ 16
+    pslld m6, 16
+    call m(idct_16x16_internal_8bpc).main_fast
+    vpermb m17, m19, [cq+64*15] ; 30
+    vpermb m20, m19, [cq+64* 9] ; 18
+    vpermb m16, m19, [cq+64*11] ; 22
+    vpermb m19, m19, [cq+64*13] ; 26
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+    mova [cq+64* 0], m14
+    mova [cq+64* 1], m15
+    mova [cq+64* 2], m16
+    mova [cq+64* 3], m17
+    mova [cq+64* 4], m18
+    mova [cq+64* 5], m19
+    mova [cq+64* 6], m20
+    mova [cq+64* 7], m21
+    vpermb m21, m30, [cq+64*15]
+    vpermb m14, m30, [cq+64* 8]
+    vpermb m17, m30, [cq+64*11]
+    vpermb m18, m30, [cq+64*12]
+    vpermb m19, m30, [cq+64*13]
+    vpermb m16, m30, [cq+64*10]
+    vpermb m15, m30, [cq+64* 9]
+    vpermb m20, m30, [cq+64*14]
+    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf
+    jmp .end
+.fast: ; bottom half is zero
+    call m(idct_16x16_internal_8bpc).main_fast2
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
+    mova [cq+64* 0], m14
+    mova [cq+64* 1], m15
+    mova [cq+64* 2], m16
+    mova [cq+64* 3], m17
+    mova [cq+64* 4], m18
+    mova [cq+64* 5], m19
+    mova [cq+64* 6], m20
+    mova [cq+64* 7], m21
+    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
+.end:
+    mova [cq+64* 8], m4
+    mova [cq+64* 9], m5
+    mova [cq+64*10], m6
+    mova [cq+64*11], m7
+    mova [cq+64*12], m26
+    mova [cq+64*13], m27
+    mova [cq+64*14], m28
+    mova [cq+64*15], m29
+    vpbroadcastd m13, [o(pw_8192)]
+    call .pass1_end
+    call .pass2
+    mova [cq+64* 0], m0
+    mova [cq+64* 1], m1
+    mova [cq+64* 2], m2
+    mova [cq+64* 3], m3
+    mova [cq+64* 4], m4
+    mova [cq+64* 5], m5
+    mova [cq+64* 6], m6
+    mova [cq+64* 7], m7
+    pmulhrsw m0, m13, [cq+64* 8]
+    pmulhrsw m1, m13, [cq+64* 9]
+    pmulhrsw m2, m13, [cq+64*10]
+    pmulhrsw m3, m13, [cq+64*11]
+    vpbroadcastd m30, [o(pw_2048)]
+    pmulhrsw m4, m13, m22
+    pmulhrsw m5, m13, m23
+    pmulhrsw m6, m13, m24
+    pmulhrsw m7, m13, m25
+    pmulhrsw m22, m30, m14
+    pmulhrsw m14, m13, m26
+    pmulhrsw m23, m30, m15
+    pmulhrsw m15, m13, m27
+    pmulhrsw m24, m30, m16
+    pmulhrsw m16, m13, m28
+    pmulhrsw m25, m30, m17
+    pmulhrsw m17, m13, m29
+    pmulhrsw m26, m30, m18
+    pmulhrsw m18, m13, [cq+64*12]
+    pmulhrsw m27, m30, m19
+    pmulhrsw m19, m13, [cq+64*13]
+    pmulhrsw m28, m30, m20
+    pmulhrsw m20, m13, [cq+64*14]
+    pmulhrsw m29, m30, m21
+    pmulhrsw m21, m13, [cq+64*15]
+    call .transpose_round
+    call .pass2
+    pxor m10, m10
+    lea r3, [strideq*3]
+%macro IDCT_64x16_END 4
+    mova m9, [dstq+%4]
+%if %1 < 8
+    pmulhrsw m%3, m30, [cq+64*%1]
+%endif
+    pmulhrsw m%2, m30
+    mova [cq+64*%1], m10
+    punpcklbw m8, m9, m10
+    punpckhbw m9, m10
+    paddw m8, m%3
+    paddw m9, m%2
+    packuswb m8, m9
+    mova [dstq+%4], m8
+%if %1 == 3 || %1 == 7 || %1 == 11
+    lea dstq, [dstq+strideq*4]
+%endif
+%endmacro
+    IDCT_64x16_END  0,  0, 11, strideq*0
+    IDCT_64x16_END  1,  1, 11, strideq*1
+    IDCT_64x16_END  2,  2, 11, strideq*2
+    IDCT_64x16_END  3,  3, 11, r3
+    IDCT_64x16_END  4,  4, 11, strideq*0
+    IDCT_64x16_END  5,  5, 11, strideq*1
+    IDCT_64x16_END  6,  6, 11, strideq*2
+    IDCT_64x16_END  7,  7, 11, r3
+    IDCT_64x16_END  8, 14, 22, strideq*0
+    IDCT_64x16_END  9, 15, 23, strideq*1
+    IDCT_64x16_END 10, 16, 24, strideq*2
+    IDCT_64x16_END 11, 17, 25, r3
+    IDCT_64x16_END 12, 18, 26, strideq*0
+    IDCT_64x16_END 13, 19, 27, strideq*1
+    IDCT_64x16_END 14, 20, 28, strideq*2
+    IDCT_64x16_END 15, 21, 29, r3
+    RET
+ALIGN function_align
+.pass1_end:
+    mova m4, [cq+64* 0]
+    mova m5, [cq+64* 1]
+    mova m6, [cq+64* 2]
+    mova m7, [cq+64* 3]
+    mova m8, [cq+64* 4]
+    mova m9, [cq+64* 5]
+    mova m11, [cq+64* 6]
+    mova m12, [cq+64* 7]
+    psubsw m29, m4, m21 ; out47 out46
+    paddsw m4, m21      ; out16 out17
+    psubsw m28, m5, m20 ; out44 out45
+    paddsw m5, m20      ; out19 out18
+    REPX {pmulhrsw x, m13}, m0, m1, m2, m3
+    psubsw m27, m6, m19 ; out43 out42
+    paddsw m6, m19      ; out20 out21
+    psubsw m26, m7, m18 ; out40 out41
+    paddsw m7, m18      ; out23 out22
+    pmulhrsw m18, m13, m22
+    pmulhrsw m19, m13, m23
+    pmulhrsw m20, m13, m24
+    pmulhrsw m21, m13, m25
+    paddsw m25, m12, m14 ; out31 out30
+    psubsw m14, m12, m14 ; out32 out33
+    paddsw m24, m11, m15 ; out28 out29
+    psubsw m15, m11, m15 ; out35 out34
+    REPX {pmulhrsw x, m13}, m4, m5, m6, m7
+    paddsw m23, m9, m16  ; out27 out26
+    psubsw m16, m9, m16  ; out36 out37
+    paddsw m22, m8, m17  ; out24 out25
+    psubsw m17, m8, m17  ; out39 out38
+    REPX {pmulhrsw x, m13}, m14, m15, m16, m17
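+; .transpose_round: four TRANSPOSE_8x4_PACKED invocations interleave eight
+; rows at a time with punpck{l,h}wd, then vshufi32x4/vinserti32x8 regroup
+; the 128-bit lanes, so the second pass can consume columns entirely in
+; registers.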
+.transpose_round:
+%macro TRANSPOSE_8x4_PACKED 4
+    punpckhwd m8, m%1, m%3  ; b0 f0 b1 f1 b2 f2 b3 f3
+    punpcklwd m%1, m%3      ; a0 e0 a1 e1 a2 e2 a3 e3
+    punpcklwd m%3, m%2, m%4 ; d0 h0 d1 h1 d2 h2 d3 h3
+    punpckhwd m%2, m%4      ; c0 g0 c1 g1 c2 g2 c3 g3
+    punpckhwd m%4, m%1, m%2 ; a2 c2 e2 g2 a3 c3 e3 g3
+    punpcklwd m%1, m%2      ; a0 c0 e0 g0 a1 c1 e1 g1
+    punpckhwd m%2, m8, m%3  ; b2 d2 f2 h2 b3 d3 f3 h3
+    punpcklwd m8, m%3       ; b0 d0 f0 h0 b1 d1 f1 h1
+    punpcklwd m%3, m%4, m%2 ; 2
+    punpckhwd m%4, m%2      ; 3
+    punpckhwd m%2, m%1, m8  ; 1
+    punpcklwd m%1, m8       ; 0
+%endmacro
+    TRANSPOSE_8x4_PACKED  0,  1,  2,  3
+    TRANSPOSE_8x4_PACKED 18, 19, 20, 21
+    TRANSPOSE_8x4_PACKED  4,  5,  6,  7
+    TRANSPOSE_8x4_PACKED 14, 15, 16, 17
+    vshufi32x4   m8, m0, m4, q3232   ; a02 a03 b02 b03
+    vinserti32x8 m0, ym4, 1          ; a00 a01 b00 b01
+    vshufi32x4   m4, m1, m5, q3232   ; a12 a13 b12 b13
+    vinserti32x8 m9, m1, ym5, 1      ; a10 a11 b10 b11
+    vshufi32x4   m5, m2, m6, q3232   ; a22 a23 b22 b23
+    vinserti32x8 m1, m2, ym6, 1      ; a20 a21 b20 b21
+    vshufi32x4   m6, m3, m7, q3232   ; a32 a33 b32 b33
+    vinserti32x8 m11, m3, ym7, 1     ; a30 a31 b30 b31
+    vshufi32x4   m2, m14, m18, q3232 ; c02 c03 d02 d03
+    vinserti32x8 m3, m14, ym18, 1    ; c00 c01 d00 d01
+    vshufi32x4   m18, m15, m19, q3232 ; c12 c13 d12 d13
+    vinserti32x8 m15, ym19, 1        ; c10 c11 d10 d11
+    vshufi32x4   m19, m16, m20, q3232 ; c22 c23 d22 d23
+    vinserti32x8 m16, ym20, 1        ; c20 c21 d20 d21
+    vshufi32x4   m20, m17, m21, q3232 ; c32 c33 d32 d33
+    vinserti32x8 m17, ym21, 1        ; c30 c31 d30 d31
+    ret
+.pass2:
+    vshufi32x4   m7, m5, m19, q3131  ; 14
+    vshufi32x4   m5, m19, q2020      ; 10
+    vshufi32x4   m21, m6, m20, q3131 ; 15
+    vshufi32x4   m19, m6, m20, q2020 ; 11
+    vshufi32x4   m20, m4, m18, q3131 ; 13
+    vshufi32x4   m18, m4, m18, q2020 ; 9
+    vshufi32x4   m6, m8, m2, q3131   ; 12
+    vshufi32x4   m4, m8, m2, q2020   ; 8
+    vshufi32x4   m2, m0, m3, q3131   ; 4
+    vshufi32x4   m0, m3, q2020       ; 0
+    vshufi32x4   m3, m1, m16, q3131  ; 6
+    vshufi32x4   m1, m16, q2020      ; 2
+    vshufi32x4   m16, m9, m15, q3131 ; 5
+    vshufi32x4   m14, m9, m15, q2020 ; 1
+    vshufi32x4   m15, m11, m17, q2020 ; 3
+    vshufi32x4   m17, m11, m17, q3131 ; 7
+    call m(inv_txfm_add_dct_dct_32x8_8bpc).main2
+    jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+
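+; 32x64 inverse DCT. Every coefficient is prescaled by pmulhrsw with
+; pw_2896x8, i.e. by 2896/4096 ~= 1/sqrt(2), the extra normalization used
+; for 2:1 rectangular transform sizes.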
+cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 7, 0, dst, stride, c, eob
+    lea r5, [o_base]
+    test eobd, eobd
+    jz .dconly
+    PROLOGUE 0, 9, 30, 64*32, dst, stride, c, eob
+    vpbroadcastd m23, [o(pw_2896x8)]
+%undef cmp
+    cmp eobd, 136
+    jb .fast
+    pmulhrsw m5, m23, [cq+64*20]
+    pmulhrsw m3, m23, [cq+64*12]
+    pmulhrsw m1, m23, [cq+64* 4]
+    pmulhrsw m7, m23, [cq+64*28]
+    pmulhrsw m2, m23, [cq+64* 8]
+    pmulhrsw m6, m23, [cq+64*24]
+    pmulhrsw m0, m23, [cq+64* 0]
+    pmulhrsw m4, m23, [cq+64*16]
+    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+    pmulhrsw m14, m23, [cq+64* 2]
+    pmulhrsw m21, m23, [cq+64*30]
+    pmulhrsw m18, m23, [cq+64*18]
+    pmulhrsw m17, m23, [cq+64*14]
+    pmulhrsw m16, m23, [cq+64*10]
+    pmulhrsw m19, m23, [cq+64*22]
+    pmulhrsw m20, m23, [cq+64*26]
+    pmulhrsw m15, m23, [cq+64* 6]
+    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+    mova [cq+64* 0], m14
+    mova [cq+64* 2], m15
+    mova [cq+64* 4], m16
+    mova [cq+64* 6], m17
+    mova [cq+64* 8], m18
+    mova [cq+64*10], m19
+    mova [cq+64*12], m20
+    mova [cq+64*14], m21
+    pmulhrsw m22, m23, [cq+64* 1]
+    pmulhrsw m21, m23, [cq+64*31]
+    pmulhrsw m14, m23, [cq+64*17]
+    pmulhrsw m29, m23, [cq+64*15]
+    pmulhrsw m26, m23, [cq+64* 9]
+    pmulhrsw m17, m23, [cq+64*23]
+    pmulhrsw m18, m23, [cq+64*25]
+    pmulhrsw m25, m23, [cq+64* 7]
+    pmulhrsw m24, m23, [cq+64* 5]
+    pmulhrsw m19, m23, [cq+64*27]
+    pmulhrsw m16, m23, [cq+64*21]
+    pmulhrsw m27, m23, [cq+64*11]
+    pmulhrsw m28, m23, [cq+64*13]
+    pmulhrsw m15, m23, [cq+64*19]
+    pmulhrsw m20, m23, [cq+64*29]
+    pmulhrsw m23, [cq+64* 3]
+    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
+    vpbroadcastd m12, [o(pw_16384)]
+    psubsw m13, m0, m29 ; 31
+    paddsw m0, m29      ; 0
+    psubsw m29, m1, m28 ; 30
+    paddsw m1, m28      ; 1
+    psubsw m28, m2, m27 ; 29
+    paddsw m2, m27      ; 2
+    psubsw m27, m3, m26 ; 28
+    paddsw m3, m26      ; 3
+    psubsw m26, m4, m25 ; 27
+    paddsw m4, m25      ; 4
+    psubsw m25, m5, m24 ; 26
+    paddsw m5, m24      ; 5
+    psubsw m24, m6, m23 ; 25
+    paddsw m6, m23      ; 6
+    psubsw m23, m7, m22 ; 24
+    paddsw m7, m22      ; 7
+    pxor m9, m9
+    punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7
+    punpcklwd m0, m1     ; a0 b0 a1 b1 a2 b2 a3 b3
+    punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7
+    punpcklwd m2, m3     ; c0 d0 c1 d1 c2 d2 c3 d3
+    REPX {mova [cq+64*x], m9}, 16, 17, 18, 19
+    punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7
+    punpcklwd m4, m5      ; e0 f0 e1 f1 e2 f2 e3 f3
+    punpckhwd m5, m6, m7  ; g4 h4 g5 h5 g6 h6 g7 h7
+    punpcklwd m6, m7      ; g0 h0 g1 h1 g2 h2 g3 h3
+    REPX {mova [cq+64*x], m9}, 20, 21, 22, 23
+    punpckhwd m3, m23, m24
+    punpcklwd m23, m24
+    punpckhwd m24, m25, m26
+    punpcklwd m25, m26
+    REPX {mova [cq+64*x], m9}, 24, 25, 26, 27
+    punpckhwd m26, m27, m28
+    punpcklwd m27, m28
+    punpckhwd m28, m29, m13
+    punpcklwd m29, m13
+    REPX {mova [cq+64*x], m9}, 28, 29, 30, 31
+    punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3
+    punpckldq m0, m2     ; a0 b0 c0 d0 a1 b1 c1 d1
+    punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3
+    punpckldq m4, m6     ; e0 f0 g0 h0 e1 f1 g1 h1
+    REPX {pmulhrsw x, m12}, m7, m0, m2, m4
+    punpckhdq m6, m8, m1  ; a6 b6 c6 d6 a7 b7 c7 d7
+    punpckldq m8, m1      ; a4 b4 c4 d4 a5 b5 c5 d5
+    punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7
+    punpckldq m22, m5     ; e4 f4 g4 h4 e5 f5 g5 h5
+    REPX {pmulhrsw x, m12}, m6, m8, m1, m22
+    punpckhdq m13, m23, m25
+    punpckldq m23, m25
+    punpckhdq m25, m27, m29
+    punpckldq m27, m29
+    REPX {pmulhrsw x, m12}, m13, m23, m25, m27
+    punpckhdq m9, m3, m24
+    punpckldq m3, m24
+    punpckhdq m24, m26, m28
+    punpckldq m26, m28
+    REPX {pmulhrsw x, m12}, m9, m3, m24, m26
+    punpckhqdq m5, m23, m27  ; d01 d09 d17 d25
+    punpcklqdq m23, m27      ; d00 d08 d16 d24
+    punpcklqdq m27, m13, m25 ; d02 d10 d18 d26
+    punpckhqdq m13, m25      ; d03 d11 d19 d27
+    punpcklqdq m25, m3, m26  ; d04 d12 d20 d28
+    punpckhqdq m3, m26       ; d05 d13 d21 d29
+    punpcklqdq m26, m9, m24  ; d06 d14 d22 d30
+    punpckhqdq m9, m24       ; d07 d15 d23 d31
+    mova [cq+64* 3], m23
+    mova [cq+64*13], m27
+    mova [cq+64* 7], m25
+    mova [cq+64*15], m26
+    punpckhqdq m24, m8, m22 ; a05 a13 a21 a29
+    punpcklqdq m8, m22      ; a04 a12 a20 a28
+    punpckhqdq m22, m0, m4  ; a01 a09 a17 a25
+    punpcklqdq m0, m4       ; a00 a08 a16 a24
+    punpckhqdq m23, m7, m2  ; a03 a11 a19 a27
+    punpcklqdq m7, m2       ; a02 a10 a18 a26
+    punpckhqdq m25, m6, m1  ; a07 a15 a23 a31
+    punpcklqdq m6, m1       ; a06 a14 a22 a30
+    mova [cq+64* 1], m0
+    mova [cq+64* 9], m7
+    mova [cq+64* 5], m8
+    mova [cq+64*11], m6
+    mova m2, [cq+64* 0]
+    mova m11, [cq+64* 2]
+    mova m8, [cq+64* 4]
+    mova m29, [cq+64* 6]
+    mova m27, [cq+64* 8]
+    mova m26, [cq+64*10]
+    mova m4, [cq+64*12]
+    mova m28, [cq+64*14]
+    psubsw m1, m2, m21   ; 23
+    paddsw m2, m21       ; 8
+    psubsw m21, m11, m20 ; 22
+    paddsw m11, m20      ; 9
+    psubsw m20, m8, m19  ; 21
+    paddsw m8, m19       ; 10
+    psubsw m19, m29, m18 ; 20
+    paddsw m29, m18      ; 11
+    psubsw m18, m27, m17 ; 19
+    paddsw m27, m17      ; 12
+    psubsw m17, m26, m16 ; 18
+    paddsw m26, m16      ; 13
+    psubsw m16, m4, m15  ; 17
+    paddsw m4, m15       ; 14
+    psubsw m15, m28, m14 ; 16
+    paddsw m28, m14      ; 15
+    punpcklwd m14, m15, m16
+    punpckhwd m15, m16
+    punpckhwd m16, m17, m18
+    punpcklwd m17, m18
+    punpckhwd m18, m19, m20
+    punpcklwd m19, m20
+    punpckhwd m20, m21, m1
+    punpcklwd m21, m1
+    punpckhwd m1, m2, m11   ; i4 j4 i5 j5 i6 j6 i7 j7
+    punpcklwd m2, m11       ; i0 j0 i1 j1 i2 j2 i3 j3
+    punpckhwd m11, m8, m29  ; k4 l4 k5 l5 k6 l6 k7 l7
+    punpcklwd m8, m29       ; k0 l0 k1 l1 k2 l2 k3 l3
+    punpckhwd m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7
+    punpcklwd m27, m26      ; m0 n0 m1 n1 m2 n2 m3 n3
+    punpckhwd m26, m4, m28  ; o4 p4 o5 p5 o6 p6 o7 p7
+    punpcklwd m4, m28       ; o0 p0 o1 p1 o2 p2 o3 p3
+    punpckhdq m28, m2, m8   ; i2 j2 k2 l2 i3 j3 k3 l3
+    punpckldq m2, m8        ; i0 j0 k0 l0 i1 j1 k1 l1
+    punpckhdq m8, m27, m4   ; m2 n2 o2 p2 m3 n3 o3 p3
+    punpckldq m27, m4       ; m0 n0 o0 p0 m1 n1 o1 p1
+    REPX {pmulhrsw x, m12}, m28, m2, m8, m27
+    punpckhdq m4, m1, m11   ; i6 j6 k6 l6 i7 j7 k7 l7
+    punpckldq m1, m11       ; i4 j4 k4 l4 i5 j5 k5 l5
+    punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7
+    punpckldq m29, m26      ; m4 n4 o4 p4 m5 n5 o5 p5
+    REPX {pmulhrsw x, m12}, m4, m1, m11, m29
+    punpckhdq m26, m19, m21
+    punpckldq m19, m21
+    punpckhdq m21, m15, m16
+    punpckldq m15, m16
+    REPX {pmulhrsw x, m12}, m26, m19, m21, m15
+    punpckhdq m16, m18, m20
+    punpckldq m18, m20
+    punpckhdq m20, m14, m17
+    punpckldq m14, m17
+    REPX {pmulhrsw x, m12}, m16, m18, m20, m14
+    punpckhqdq m17, m28, m8 ; b03 b11 b19 b27
+    punpcklqdq m28, m8      ; b02 b10 b18 b26
+    punpckhqdq m8, m2, m27  ; b01 b09 b17 b25
+    punpcklqdq m2, m27      ; b00 b08 b16 b24
+    punpcklqdq m27, m1, m29 ; b04 b12 b20 b28
+    punpckhqdq m1, m29      ; b05 b13 b21 b29
+    punpcklqdq m29, m4, m11 ; b06 b14 b22 b30
+    punpckhqdq m4, m11      ; b07 b15 b23 b31
+    mova [cq+64* 0], m2
+    mova [cq+64* 8], m28
+    mova [cq+64* 4], m27
+    mova [cq+64*10], m29
+    punpckhqdq m27, m20, m26 ; c03 c11 c19 c27
+    punpcklqdq m20, m26      ; c02 c10 c18 c26
+    punpckhqdq m26, m14, m19 ; c01 c09 c17 c25
+    punpcklqdq m14, m19      ; c00 c08 c16 c24
+    punpckhqdq m28, m15, m18 ; c05 c13 c21 c29
+    punpcklqdq m15, m18      ; c04 c12 c20 c28
+    punpckhqdq m29, m21, m16 ; c07 c15 c23 c31
+    punpcklqdq m21, m16      ; c06 c14 c22 c30
+    mova [cq+64* 2], m14
+    mova [cq+64*12], m20
+    mova [cq+64* 6], m15
+    mova [cq+64*14], m21
+    vshufi32x4   m14, m22, m8, q3232 ; a17 a25 b17 b25
+    vinserti32x8 m22, ym8, 1         ; a01 a09 b01 b09
+    vshufi32x4   m15, m23, m17, q3232 ; a19 a27 b19 b27
+    vinserti32x8 m23, ym17, 1        ; a03 a11 b03 b11
+    vshufi32x4   m16, m24, m1, q3232 ; a21 a29 b21 b29
+    vinserti32x8 m24, ym1, 1         ; a05 a13 b05 b13
+    vshufi32x4   m17, m25, m4, q3232 ; a23 a31 b23 b31
+    vinserti32x8 m25, ym4, 1         ; a07 a15 b07 b15
+    vinserti32x8 m19, m26, ym5, 1    ; c01 c09 d01 d09
+    vshufi32x4   m26, m5, q3232      ; c17 c25 d17 d25
+    vinserti32x8 m20, m27, ym13, 1   ; c03 c11 d03 d11
+    vshufi32x4   m27, m13, q3232     ; c19 c27 d19 d27
+    vinserti32x8 m21, m28, ym3, 1    ; c05 c13 d05 d13
+    vshufi32x4   m28, m3, q3232      ; c21 c29 d21 d29
+    vinserti32x8 m18, m29, ym9, 1    ; c07 c15 d07 d15
+    vshufi32x4   m29, m9, q3232      ; c23 c31 d23 d31
+    mov r4, rsp
+    vshufi32x4   m0, m22, m19, q2020 ; 1
+    vshufi32x4   m1, m17, m29, q3131 ; 31
+    vshufi32x4   m2, m14, m26, q2020 ; 17
+    vshufi32x4   m3, m25, m18, q3131 ; 15
+    call .main_part1
+    vshufi32x4   m0, m25, m18, q2020 ; 7
+    vshufi32x4   m1, m14, m26, q3131 ; 25
+    vshufi32x4   m2, m17, m29, q2020 ; 23
+    vshufi32x4   m3, m22, m19, q3131 ; 9
+    call .main_part1
+    vshufi32x4   m0, m24, m21, q2020 ; 5
+    vshufi32x4   m1, m15, m27, q3131 ; 27
+    vshufi32x4   m2, m16, m28, q2020 ; 21
+    vshufi32x4   m3, m23, m20, q3131 ; 11
+    call .main_part1
+    vshufi32x4   m0, m23, m20, q2020 ; 3
+    vshufi32x4   m1, m16, m28, q3131 ; 29
+    vshufi32x4   m2, m15, m27, q2020 ; 19
+    vshufi32x4   m3, m24, m21, q3131 ; 13
+    call .main_part1
+    call .main_part2
+    mova m0, [cq+64* 1]  ; a0
+    mova m15, [cq+64* 0] ; b0
+    mova m3, [cq+64* 2]  ; c0
+    mova m16, [cq+64* 3] ; d0
+    mova m14, [cq+64* 5] ; a4
+    mova m8, [cq+64* 4]  ; b4
+    mova m17, [cq+64* 6] ; c4
+    mova m1, [cq+64* 7]  ; d4
+    vshufi32x4   m2, m0, m15, q3232  ; a16 a24 b16 b24
+    vinserti32x8 m0, ym15, 1         ; a00 a08 b00 b08
+    vshufi32x4   m15, m3, m16, q3232 ; c16 c24 d16 d24
+    vinserti32x8 m3, ym16, 1         ; c00 c08 d00 d08
+    vshufi32x4   m16, m14, m8, q3232 ; a20 a28 b20 b28
+    vinserti32x8 m14, ym8, 1         ; a04 a12 b04 b12
+    vshufi32x4   m8, m17, m1, q3232  ; c20 c28 d20 d28
+    vinserti32x8 m17, ym1, 1         ; c04 c12 d04 d12
+    vshufi32x4   m1, m0, m3, q3131   ; 8
+    vshufi32x4   m0, m3, q2020       ; 0
+    vshufi32x4   m3, m2, m15, q3131  ; 24
+    vshufi32x4   m2, m15, q2020      ; 16
+    vshufi32x4   m15, m14, m17, q3131 ; 12
+    vshufi32x4   m14, m17, q2020     ; 4
+    vshufi32x4   m17, m16, m8, q3131 ; 28
+    vshufi32x4   m16, m8, q2020      ; 20
+    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+    mova m8, [cq+64* 8]
+    mova m9, [cq+64*12]
+    mova m11, [cq+64*10]
+    mova m12, [cq+64*14]
+    mova [cq+64* 0], m14
+    mova [cq+64* 2], m15
+    mova [cq+64* 4], m16
+    mova [cq+64* 6], m17
+    mova [cq+64* 8], m18
+    mova [cq+64*10], m19
+    mova [cq+64*12], m20
+    mova [cq+64*14], m21
+    mova m22, [cq+64* 9]
+    mova m27, [cq+64*13]
+    mova m23, [cq+64*11]
+    mova m24, [cq+64*15]
+    vshufi32x4   m26, m22, m8, q3232 ; a18 a26 b18 b26
+    vinserti32x8 m22, ym8, 1         ; a02 a10 b02 b10
+    vshufi32x4   m8, m9, m27, q3232  ; c18 c26 d18 d26
+    vinserti32x8 m9, ym27, 1         ; c02 c10 d02 d10
+    vshufi32x4   m27, m23, m11, q3232 ; a22 a30 b22 b30
+    vinserti32x8 m23, ym11, 1        ; a06 a14 b06 b14
+    vshufi32x4   m11, m12, m24, q3232 ; c22 c30 d22 d30
+    vinserti32x8 m12, ym24, 1        ; c06 c14 d06 d14
+    vshufi32x4   m28, m26, m8, q3131 ; 26
+    vshufi32x4   m26, m8, q2020      ; 18
+    vshufi32x4   m24, m22, m9, q3131 ; 10
+    vshufi32x4   m22, m9, q2020      ; 2
+    vshufi32x4   m29, m27, m11, q3131 ; 30
+    vshufi32x4   m27, m11, q2020     ; 22
+    vshufi32x4   m25, m23, m12, q3131 ; 14
+    vshufi32x4   m23, m12, q2020     ; 6
+    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+    jmp .end
+.fast: ; bottom/right halves are zero
+    pmulhrsw ym9, ym23, [cq+64* 0]
+    pmulhrsw ym6, ym23, [cq+64* 8]
+    mova m14, [o(dup16_perm)]
+    pmulhrsw ym8, ym23, [cq+64* 2]
+    pmulhrsw xm0, xm23, [cq+64*14]
+    pmulhrsw xm5, xm23, [cq+64*10]
+    pmulhrsw ym1, ym23, [cq+64* 6]
+    pmulhrsw ym7, ym23, [cq+64* 4]
+    pmulhrsw xm3, xm23, [cq+64*12]
+    pmovzxwd m9, ym9
+    pmovzxwd m6, ym6
+    vpermb m8, m14, m8
+    punpcklwd xm0, xm0
+    vpermb ym5, ym14, ym5
+    vpermb m1, m14, m1
+    vpermb m7, m14, m7
+    punpcklwd xm3, xm3
+    pslld m9, 16
+    pslld m6, 16
+    call m(idct_16x16_internal_8bpc).main_fast
+    vpmulhrsw       ym21, ym23, [cq+64* 1]
+    {evex}vpmulhrsw xm17, xm23, [cq+64*15] ; force EVEX encoding, which
+    {evex}vpmulhrsw xm20, xm23, [cq+64* 9] ; reduces code size due to
+    {evex}vpmulhrsw ym15, ym23, [cq+64* 7] ; compressed displacements
+    {evex}vpmulhrsw ym18, ym23, [cq+64* 5]
+    {evex}vpmulhrsw xm16, xm23, [cq+64*11]
+    {evex}vpmulhrsw xm19, xm23, [cq+64*13]
+    {evex}vpmulhrsw ym23, [cq+64* 3]
+    vpermb m21, m14, m21
+    punpcklwd xm17, xm17
+    vpermb ym20, ym14, ym20
+    vpermb m15, m14, m15
+    vpermb m18, m14, m18
+    vpermb ym16, ym14, ym16
+    punpcklwd xm19, xm19
+    vpermb m14, m14, m23
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+    vpbroadcastd m9, [o(pw_16384)]
+    call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round
+    vshufi32x4   m16, m0, m3, q2020  ; 0
+    vshufi32x4   m26, m0, m3, q3131  ; 4
+    vshufi32x4   m0, m14, m2, q2020  ; 1
+    vshufi32x4   m14, m2, q3131      ; 5
+    vshufi32x4   m3, m19, m7, q3131  ; 15
+    vshufi32x4   m19, m7, q2020      ; 11
+    vshufi32x4   m27, m17, m9, q2020 ; 3
+    vshufi32x4   m17, m9, q3131      ; 7
+    vshufi32x4   m28, m20, m6, q2020 ; 9
+    vshufi32x4   m20, m6, q3131      ; 13
+    vshufi32x4   m22, m1, m18, q2020 ; 2
+    vshufi32x4   m23, m1, m18, q3131 ; 6
+    vshufi32x4   m24, m5, m15, q2020 ; 10
+    vshufi32x4   m25, m5, m15, q3131 ; 14
+    vshufi32x4   m15, m21, m4, q3131 ; 12
+    vshufi32x4   m21, m21, m4, q2020 ; 8
+    mov r4, rsp
+    call .main_part1_fast
+    mova m0, m17
+    mova m3, m28
+    call .main_part1_fast
+    mova m0, m14
+    mova m3, m19
+    call .main_part1_fast
+    mova m0, m27
+    mova m3, m20
+    call .main_part1_fast
+    call .main_part2
+    mova m0, m16
+    mova m1, m21
+    mova m14, m26
+    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
+    mova [cq+64*14], m21
+    mova [cq+64* 0], m14
+    mova [cq+64* 6], m17
+    mova [cq+64* 8], m18
+    mova [cq+64*10], m19
+    mova [cq+64* 4], m16
+    mova [cq+64* 2], m15
+    mova [cq+64*12], m20
+    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
+.end:
+    lea r4, [strideq*3]
+    vpbroadcastd m12, [o(pw_2048)]
+    movshdup m13, [o(permD)]
+    lea r5, [r4+strideq] ; stride*4
+    lea r3, [dstq+r4*8]
+    lea r6, [strideq+r5*8] ; stride*33
+    lea r8, [r4+r5*8]      ; stride*35
+    add r3, r5             ; dst+stride*28
+    lea r7, [r6+strideq]   ; stride*34
+%macro IDCT_32x64_END 6 ; src, mem, stride[1-4]
+%if %2 < 8
+    paddsw m10, m%2, m%1
+    psubsw m11, m%2, m%1
+%else
+    mova m11, [cq+64*(%2*2-16)]
+    paddsw m10, m11, m%1
+    psubsw m11, m%1
+%endif
+    mova m9, [rsp+64*(31-%2)]
+    mova m%1, [rsp+64*%2]
+    paddsw m8, m10, m9
+    psubsw m10, m9
+    paddsw m9, m11, m%1
+    pmovzxbw m0, [dstq+%3]
+    psubsw m11, m%1
+    pmovzxbw m%1, [r3 +%4]
+    REPX {pmulhrsw x, m12}, m8, m10, m9, m11
+    paddw m8, m0
+    pmovzxbw m0, [r3 +%5]
+    paddw m10, m%1
+    pmovzxbw m%1, [dstq+%6]
+    paddw m9, m0
+    paddw m11, m%1
+%if %2 >= 8
+%if %2 == 8
+    pxor m1, m1
+%endif
+    mova [cq+64*(%2*2-16)], m1
+    mova [cq+64*(%2*2-15)], m1
+%endif
+    packuswb m8, m10
+    packuswb m9, m11
+    vpermq m8, m13, m8
+    vpermq m9, m13, m9
+    mova [dstq+%3], ym8
+    vextracti32x8 [r3 +%4], m8, 1
+    mova [r3 +%5], ym9
+    vextracti32x8 [dstq+%6], m9, 1
+%if %2 == 3 || %2 == 7 || %2 == 11
+    add dstq, r5
+    sub r3, r5
+%endif
+%endmacro
+    IDCT_32x64_END 29,  0, strideq*0, r8, r4       , r5*8
+    IDCT_32x64_END 28,  1, strideq*1, r7, strideq*2, r6
+    IDCT_32x64_END 27,  2, strideq*2, r6, strideq*1, r7
+    IDCT_32x64_END 26,  3, r4       , r5*8, strideq*0, r8
+    IDCT_32x64_END 25,  4, strideq*0, r8, r4       , r5*8
+    IDCT_32x64_END 24,  5, strideq*1, r7, strideq*2, r6
+    IDCT_32x64_END 23,  6, strideq*2, r6, strideq*1, r7
+    IDCT_32x64_END 22,  7, r4       , r5*8, strideq*0, r8
+    IDCT_32x64_END 21,  8, strideq*0, r8, r4       , r5*8
+    IDCT_32x64_END 20,  9, strideq*1, r7, strideq*2, r6
+    IDCT_32x64_END 19, 10, strideq*2, r6, strideq*1, r7
+    IDCT_32x64_END 18, 11, r4       , r5*8, strideq*0, r8
+    IDCT_32x64_END 17, 12, strideq*0, r8, r4       , r5*8
+    IDCT_32x64_END 16, 13, strideq*1, r7, strideq*2, r6
+    IDCT_32x64_END 15, 14, strideq*2, r6, strideq*1, r7
+    IDCT_32x64_END 14, 15, r4       , r5*8, strideq*0, r8
+    RET
+.dconly:
+    movsx r6d, word [cq]
+    mov [cq], eobd
+    or r3d, 64
+    imul r6d, 181
+    add r6d, 128
+    sar r6d, 8
+    imul r6d, 181
+    add r6d, 128+256
+    sar r6d, 8+1
+    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3
+ALIGN function_align ; bottom three-quarters are zero
+.main_part1_fast:
+    vpbroadcastd m1, [o(idct64_mul+4*0)]
+    vpbroadcastd m8, [o(idct64_mul+4*1)]
+    vpbroadcastd m2, [o(idct64_mul+4*6)]
+    vpbroadcastd m9, [o(idct64_mul+4*7)]
+    pmulhrsw m1, m0 ; t63a
+    pmulhrsw m0, m8 ; t32a
+    pmulhrsw m2, m3 ; t60a
+    pmulhrsw m3, m9 ; t35a
+    mova m8, m0
+    mova m7, m1
+    mova m6, m3
+    mova m5, m2
+    jmp .main_part1b
+.main_part1:
+    ; idct64 steps 1-5:
+    ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+    ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+    ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+    ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+    vpbroadcastd m7, [o(idct64_mul+4*0)]
+    vpbroadcastd m8, [o(idct64_mul+4*1)]
+    vpbroadcastd m6, [o(idct64_mul+4*2)]
+    vpbroadcastd m9, [o(idct64_mul+4*3)]
+    pmulhrsw m7, m0 ; t63a
+    vpbroadcastd m5, [o(idct64_mul+4*4)]
+    pmulhrsw m0, m8 ; t32a
+    vpbroadcastd m8, [o(idct64_mul+4*5)]
+    pmulhrsw m6, m1 ; t62a
+    vpbroadcastd m4, [o(idct64_mul+4*6)]
+    pmulhrsw m1, m9 ; t33a
+    vpbroadcastd m9, [o(idct64_mul+4*7)]
+    pmulhrsw m5, m2 ; t61a
+    pmulhrsw m2, m8 ; t34a
+    pmulhrsw m4, m3 ; t60a
+    pmulhrsw m3, m9 ; t35a
+    psubsw m8, m0, m1 ; t33
+    paddsw m0, m1     ; t32
+    psubsw m1, m7, m6 ; t62
+    paddsw m7, m6     ; t63
+    psubsw m6, m3, m2 ; t34
+    paddsw m3, m2     ; t35
+    psubsw m2, m4, m5 ; t61
+    paddsw m5, m4     ; t60
+.main_part1b:
+    vpbroadcastd m11, [o(idct64_mul+4*8)]
+    vpbroadcastd m12, [o(idct64_mul+4*9)]
+    ITX_MULSUB_2W 1, 8, 4, 9, 10, 11, 12 ; t33a, t62a
+    vpbroadcastd m11, [o(idct64_mul+4*10)]
+    ITX_MULSUB_2W 2, 6, 4, 9, 10, 12, 11 ; t34a, t61a
+    vpbroadcastd m11, [o(idct64_mul+4*11)]
+    vpbroadcastd m12, [o(idct64_mul+4*12)]
+    psubsw m4, m0, m3 ; t35a
+    paddsw m0, m3     ; t32a
+    psubsw m3, m7, m5 ; t60a
+    paddsw m7, m5     ; t63a
+    psubsw m5, m1, m2 ; t34
+    paddsw m1, m2     ; t33
+    psubsw m2, m8, m6 ; t61
+    paddsw m6, m8     ; t62
+    add r5, 4*13
+    ITX_MULSUB_2W 3, 4, 8, 9, 10, 11, 12 ; t35,  t60
+    ITX_MULSUB_2W 2, 5, 8, 9, 10, 11, 12 ; t34a, t61a
+    mova [r4+64*0], m0
+    mova [r4+64*7], m7
+    mova [r4+64*1], m1
+    mova [r4+64*6], m6
+    mova [r4+64*3], m3
+    mova [r4+64*4], m4
+    mova [r4+64*2], m2
+    mova [r4+64*5], m5
+    add r4, 64*8
+    ret
+.main_part2:
+    vpbroadcastd m11, [o(pw_1567_3784  -16*13)]
+    vpbroadcastd m12, [o(pw_m3784_1567 -16*13)]
+    lea r6, [r4+64*7]
+    vpbroadcastd m17, [o(pw_m1567_m3784-16*13)]
+    vpbroadcastd m18, [o(pw_2896_2896  -16*13)]
+    vpbroadcastd m19, [o(pw_m2896_2896 -16*13)]
+    sub r5, 16*13
+.main_part2_loop:
+    mova m0, [r4-64*32] ; t32a
+    mova m1, [r6-64*24] ; t39a
+    mova m2, [r6-64*32] ; t63a
+    mova m3, [r4-64*24] ; t56a
+    mova m4, [r4-64*16] ; t40a
+    mova m5, [r6-64* 8] ; t47a
+    mova m6, [r6-64*16] ; t55a
+    mova m7, [r4-64* 8] ; t48a
+    psubsw m8, m0, m1 ; t39
+    paddsw m0, m1     ; t32
+    psubsw m1, m2, m3 ; t56
+    paddsw m2, m3     ; t63
+    psubsw m3, m5, m4 ; t40
+    paddsw m5, m4     ; t47
+    psubsw m4, m7, m6 ; t55
+    paddsw m7, m6     ; t48
+    ITX_MULSUB_2W 1, 8, 6, 9, 10, 11, 12 ; t39a, t56a
+    ITX_MULSUB_2W 4, 3, 6, 9, 10, 12, 17 ; t40a, t55a
+    psubsw m6, m2, m7 ; t48a
+    paddsw m2, m7     ; t63a
+    psubsw m7, m0, m5 ; t47a
+    paddsw m0, m5     ; t32a
+    psubsw m5, m8, m3 ; t55
+    paddsw m8, m3     ; t56
+    psubsw m3, m1, m4 ; t40
+    paddsw m1, m4     ; t39
+    ITX_MULSUB_2W 6, 7, 4, 9, 10, 18, 19 ; t47,  t48
+    ITX_MULSUB_2W 5, 3, 4, 9, 10, 18, 19 ; t40a, t55a
+    mova [r6-64* 8], m2
+    mova [r4-64*32], m0
+    mova [r4-64* 8], m8
+    mova [r6-64*32], m1
+    mova [r6-64*24], m6
+    mova [r4-64*16], m7
+    mova [r4-64*24], m5
+    mova [r6-64*16], m3
+    add r4, 64
+    sub r6, 64
+    cmp r4, r6
+    jb .main_part2_loop
+    ret
+
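+; 64x32 inverse DCT, sharing the idct64 helpers of the 32x64 function above:
+; four .main_part1 calls each process one quarter of the odd inputs, and
+; .main_part2 walks r4 upward and r6 downward over the scratch rows until
+; they meet, completing the remaining butterfly stages in place.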
+cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 7, 0, dst, stride, c, eob
+    lea r5, [o_base]
+    test eobd, eobd
+    jz .dconly
+    PROLOGUE 0, 7, 30, 64*32, dst, stride, c, eob
+    vpbroadcastd m23, [o(pw_2896x8)]
+%undef cmp
+    cmp eobd, 136
+    jb .fast
+    pmulhrsw m0, m23, [cq+64* 1]
+    pmulhrsw m1, m23, [cq+64*31]
+    pmulhrsw m2, m23, [cq+64*17]
+    pmulhrsw m3, m23, [cq+64*15]
+    vpbroadcastd m10, [o(pd_2048)]
+    mov r4, rsp
+    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+    pmulhrsw m0, m23, [cq+64* 7]
+    pmulhrsw m1, m23, [cq+64*25]
+    pmulhrsw m2, m23, [cq+64*23]
+    pmulhrsw m3, m23, [cq+64* 9]
+    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+    pmulhrsw m0, m23, [cq+64* 5]
+    pmulhrsw m1, m23, [cq+64*27]
+    pmulhrsw m2, m23, [cq+64*21]
+    pmulhrsw m3, m23, [cq+64*11]
+    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+    pmulhrsw m0, m23, [cq+64* 3]
+    pmulhrsw m1, m23, [cq+64*29]
+    pmulhrsw m2, m23, [cq+64*19]
+    pmulhrsw m3, m23, [cq+64*13]
+    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+    call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
+    pmulhrsw m3, m23, [cq+64*24]
+    pmulhrsw m1, m23, [cq+64* 8]
+    pmulhrsw m2, m23, [cq+64*16]
+    pmulhrsw m0, m23, [cq+64* 0]
+    pmulhrsw m14, m23, [cq+64* 4]
+    pmulhrsw m17, m23, [cq+64*28]
+    pmulhrsw m16, m23, [cq+64*20]
+    pmulhrsw m15, m23, [cq+64*12]
+    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+    pmulhrsw m22, m23, [cq+64* 2]
+    pmulhrsw m29, m23, [cq+64*30]
+    pmulhrsw m26, m23, [cq+64*18]
+    pmulhrsw m25, m23, [cq+64*14]
+    pmulhrsw m24, m23, [cq+64*10]
+    pmulhrsw m27, m23, [cq+64*22]
+    pmulhrsw m28, m23, [cq+64*26]
+    pmulhrsw m23, [cq+64* 6]
+    mova [cq+64* 0], m14
+    mova [cq+64* 1], m15
+    mova [cq+64* 2], m16
+    mova [cq+64* 3], m17
+    mova [cq+64* 4], m18
+    mova [cq+64* 5], m19
+    mova [cq+64* 6], m20
+    mova [cq+64* 7], m21
+    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+    vpbroadcastd m13, [o(pw_16384)]
+    call .pass1_end_part1
+    mova [cq+64*16], m1
+    mova [cq+64*17], m3
+    mova [cq+64*18], m5
+    mova [cq+64*19], m7
+    mova [cq+64*24], m23
+    mova [cq+64*25], m25
+    mova [cq+64*26], m27
+    mova [cq+64*27], m29
+    pmulhrsw m23, m13, m0 ; a0
+    pmulhrsw m25, m13, m2 ; a2
+    pmulhrsw m27, m13, m4 ; a4
+    pmulhrsw m29, m13, m6 ; a6
+    REPX {pmulhrsw x, m13}, m22, m24, m26, m28 ; e0 e2 e4 e6
+    call .pass1_end_part2
+    mova [cq+64*20], m15
+    mova [cq+64*21], m17
+    mova [cq+64*22], m19
+    mova [cq+64*23], m21
+    mova [cq+64*28], m1
+    mova [cq+64*29], m3
+    mova [cq+64*30], m5
+    mova [cq+64*31], m7
+    REPX {pmulhrsw x, m13}, m14, m16, m18, m20 ; c0 c2 c4 c6
+    REPX {pmulhrsw x, m13}, m0, m2, m4, m6     ; g0 g2 g4 g6
+    vinserti32x8 m3, m23, ym14, 1 ; a00 a01 c00 c01
+    vshufi32x4   m23, m14, q3232  ; a02 a03 c02 c03
+    vinserti32x8 m15, m22, ym0, 1 ; e00 e01 g00 g01
+    vshufi32x4   m22, m0, q3232   ; e02 e03 g02 g03
+    vinserti32x8 m1, m27, ym18, 1 ; a40 a41 c40 c41
+    vshufi32x4   m27, m18, q3232  ; a42 a43 c42 c43
+    vinserti32x8 m18, m26, ym4, 1 ; e40 e41 g40 g41
+    vshufi32x4   m26, m4, q3232   ; e42 e43 g42 g43
+    vinserti32x8 m14, m25, ym16, 1 ; a20 a21 c20 c21
+    vshufi32x4   m25, m16, q3232  ; a22 a23 c22 c23
+    vinserti32x8 m17, m24, ym2, 1 ; e20 e21 g20 g21
+    vshufi32x4   m24, m2, q3232   ; e22 e23 g22 g23
+    vinserti32x8 m19, m29, ym20, 1 ; a60 a61 c60 c61
+    vshufi32x4   m29, m20, q3232  ; a62 a63 c62 c63
+    vinserti32x8 m20, m28, ym6, 1 ; e60 e61 g60 g61
+    vshufi32x4   m28, m6, q3232   ; e62 e63 g62 g63
+    vshufi32x4   m2, m3, m15, q3131  ; 8
+    vshufi32x4   m0, m3, m15, q2020  ; 0
+    vshufi32x4   m6, m23, m22, q3131 ; 24
+    vshufi32x4   m4, m23, m22, q2020 ; 16
+    vshufi32x4   m3, m1, m18, q3131  ; 12
+    vshufi32x4   m1, m18, q2020      ; 4
+    vshufi32x4   m7, m27, m26, q3131 ; 28
+    vshufi32x4   m5, m27, m26, q2020 ; 20
+    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+    vshufi32x4   m16, m14, m17, q3131 ; 10
+    vshufi32x4   m14, m17, q2020      ; 2
+    vshufi32x4   m17, m19, m20, q3131 ; 14
+    vshufi32x4   m15, m19, m20, q2020 ; 6
+    vshufi32x4   m20, m25, m24, q3131 ; 26
+    vshufi32x4   m18, m25, m24, q2020 ; 18
+    vshufi32x4   m21, m29, m28, q3131 ; 30
+    vshufi32x4   m19, m29, m28, q2020 ; 22
+    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+    pmulhrsw m22, m13, [cq+64*16] ; a1
+    pmulhrsw m23, m13, [cq+64*20] ; c1
+    pmulhrsw m24, m13, [cq+64*24] ; e1
+    pmulhrsw m25, m13, [cq+64*28] ; g1
+    pmulhrsw m26, m13, [cq+64*17] ; a3
+    pmulhrsw m27, m13, [cq+64*21] ; c3
+    pmulhrsw m28, m13, [cq+64*25] ; e3
+    pmulhrsw m29, m13, [cq+64*29] ; g3
+    mova [cq+64* 8], m14
+    mova [cq+64* 9], m15
+    mova [cq+64*10], m16
+    mova [cq+64*11], m17
+    mova [cq+64*12], m18
+    mova [cq+64*13], m19
+    mova [cq+64*14], m20
+    mova [cq+64*15], m21
+    pmulhrsw m14, m13, [cq+64*18] ; a5
+    pmulhrsw m15, m13, [cq+64*22] ; c5
+    pmulhrsw m16, m13, [cq+64*26] ; e5
+    pmulhrsw m17, m13, [cq+64*30] ; g5
+    pmulhrsw m18, m13, [cq+64*19] ; a7
+    pmulhrsw m19, m13, [cq+64*23] ; c7
+    pmulhrsw m20, m13, [cq+64*27] ; e7
+    pmulhrsw m21, m13, [cq+64*31] ; g7
+    vinserti32x8 m8, m22, ym23, 1 ; a10 a11 c10 c11
+    vshufi32x4   m22, m23, q3232  ; a12 a13 c12 c13
+    vinserti32x8 m9, m24, ym25, 1 ; e10 e11 g10 g11
+    vshufi32x4   m24, m25, q3232  ; e12 e13 g12 g13
+    vinserti32x8 m23, m26, ym27, 1 ; a30 a31 c30 c31
+    vshufi32x4   m26, m27, q3232  ; a32 a33 c32 c33
+    vinserti32x8 m11, m28, ym29, 1 ; e30 e31 g30 g31
+    vshufi32x4   m28, m29, q3232  ; e32 e33 g32 g33
+    mova [cq+64* 0], m0
+    mova [cq+64* 1], m1
+    mova [cq+64* 2], m2
+    mova [cq+64* 3], m3
+    mova [cq+64* 4], m4
+    mova [cq+64* 5], m5
+    mova [cq+64* 6], m6
+    mova [cq+64* 7], m7
+    vinserti32x8 m12, m14, ym15, 1 ; a50 a51 c50 c51
+    vshufi32x4   m14, m15, q3232  ; a52 a53 c52 c53
+    vinserti32x8 m13, m16, ym17, 1 ; e50 e51 g50 g51
+    vshufi32x4   m16, m17, q3232  ; e52 e53 g52 g53
+    vinserti32x8 m25, m18, ym19, 1 ; a70 a71 c70 c71
+    vshufi32x4   m18, m19, q3232  ; a72 a73 c72 c73
+    vinserti32x8 m17, m20, ym21, 1 ; e70 e71 g70 g71
+    vshufi32x4   m20, m21, q3232  ; e72 e73 g72 g73
+    vshufi32x4   m27, m23, m11, q3131 ; 11 m27
+    vshufi32x4   m23, m11, q2020      ;  3 m23
+    vshufi32x4   m19, m26, m28, q3131 ; 27 m19
+    vshufi32x4   m15, m26, m28, q2020 ; 19 m15
+    vshufi32x4   m29, m25, m17, q3131 ; 15 m29
+    vshufi32x4   m25, m17, q2020      ;  7 m25
+    vshufi32x4   m21, m18, m20, q3131 ; 31 m21
+    vshufi32x4   m17, m18, m20, q2020 ; 23 m17
+    vshufi32x4   m20, m14, m16, q3131 ; 29 m20
+    vshufi32x4   m16, m14, m16, q2020 ; 21 m16
+    vshufi32x4   m18, m22, m24, q3131 ; 25 m18
+    vshufi32x4   m14, m22, m24, q2020 ; 17 m14
+    vshufi32x4   m26, m8, m9, q3131   ;  9 m26
+    vshufi32x4   m22, m8, m9, q2020   ;  1 m22
+    vshufi32x4   m28, m12, m13, q3131 ; 13 m28
+    vshufi32x4   m24, m12, m13, q2020 ;  5 m24
+    call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
+    vpbroadcastd m13, [o(pw_16384)]
+    pmulhrsw m0, m13, [r4-64*21]
+    pmulhrsw m1, m13, [r4-64*22]
+    pmulhrsw m2, m13, [r4-64*23]
+    pmulhrsw m3, m13, [r4-64*24]
+    pmulhrsw m4, m13, [r4-64*25]
+    pmulhrsw m5, m13, [r4-64*26]
+    pmulhrsw m6, m13, [r4-64*27]
+    pmulhrsw m7, m13, [r4-64*28]
+    mova [cq+64*16], m14
+    mova [cq+64*17], m15
+    mova [cq+64*18], m16
+    mova [cq+64*19], m17
+    mova [cq+64*20], m18
+    mova [cq+64*21], m19
+    mova [cq+64*22], m20
+    mova [cq+64*23], m21
+    pmulhrsw m14, m13, [r4-64*12]
+    pmulhrsw m15, m13, [r4-64*11]
+    pmulhrsw m16, m13, [r4-64*10]
+    pmulhrsw m17, m13, [r4-64* 9]
+    pmulhrsw m18, m13, [r4-64* 8]
+    pmulhrsw m19, m13, [r4-64* 7]
+    pmulhrsw m20, m13, [r4-64* 6]
+    pmulhrsw m21, m13, [r4-64* 5]
+    mova [cq+64*24], m22
+    mova [cq+64*25], m23
+    mova [cq+64*26], m24
+    mova [cq+64*27], m25
+    mova [cq+64*28], m26
+    mova [cq+64*29], m27
+    mova [cq+64*30], m28
+    mova [cq+64*31], m29
+    call .transpose_2x8x8_lo
+    mova [r4-64*12], m1
+    mova [r4-64*11], m3
+    mova [r4-64*10], m5
+    mova [r4-64* 9], m7
+    mova [r4-64* 8], m15
+    mova [r4-64* 7], m17
+    mova [r4-64* 6], m19
+    mova [r4-64* 5], m21
+    vinserti32x8 m22, m0, ym14, 1 ; f00 f01 h00 h01
+    vshufi32x4   m23, m0, m14, q3232 ; f02 f03 h02 h03
+    vinserti32x8 m24, m2, ym16, 1 ; f20 f21 h20 h21
+    vshufi32x4   m25, m2, m16, q3232 ; f22 f23 h22 h23
+    vinserti32x8 m26, m4, ym18, 1 ; f40 f41 h40 h41
+    vshufi32x4   m27, m4, m18, q3232 ; f42 f43 h42 h43
+    vinserti32x8 m28, m6, ym20, 1 ; f60 f61 h60 h61
+    vshufi32x4   m29, m6, m20, q3232 ; f62 f63 h62 h63
+    pmulhrsw m0, m13, [r4-64*20]
+    pmulhrsw m1, m13, [r4-64*19]
+    pmulhrsw m2, m13, [r4-64*18]
+    pmulhrsw m3, m13, [r4-64*17]
+    pmulhrsw m4, m13, [r4-64*16]
+    pmulhrsw m5, m13, [r4-64*15]
+    pmulhrsw m6, m13, [r4-64*14]
+    pmulhrsw m7, m13, [r4-64*13]
+    pmulhrsw m14, m13, [r4-64*29]
+    pmulhrsw m15, m13, [r4-64*30]
+    pmulhrsw m16, m13, [r4-64*31]
+    pmulhrsw m17, m13, [r4-64*32]
+    pmulhrsw m18, m13, [r4-64*33]
+    pmulhrsw m19, m13, [r4-64*34]
+    pmulhrsw m20, m13, [r4-64*35]
+    pmulhrsw m21, m13, [r4-64*36]
+    call .transpose_2x8x8_lo
+    mova [r4-64*20], m1
+    mova [r4-64*19], m3
+    mova [r4-64*18], m5
+    mova [r4-64*17], m7
+    mova [r4-64*16], m15
+    mova [r4-64*15], m17
+    mova [r4-64*14], m19
+    mova [r4-64*13], m21
+    vinserti32x8 m1, m4, ym18, 1 ; b40 b41 d40 d41
+    vshufi32x4   m5, m4, m18, q3232 ; b42 b43 d42 d43
+    vshufi32x4   m4, m0, m14, q3232 ; b02 b03 d02 d03
+    vinserti32x8 m0, ym14, 1     ; b00 b01 d00 d01
+    vinserti32x8 m14, m2, ym16, 1 ; b20 b21 d20 d21
+    vshufi32x4   m18, m2, m16, q3232 ; b22 b23 d22 d23
+    vinserti32x8 m15, m6, ym20, 1 ; b60 b61 d60 d61
+    vshufi32x4   m19, m6, m20, q3232 ; b62 b63 d62 d63
+    vshufi32x4   m2, m0, m22, q3131 ; 8
+    vshufi32x4   m0, m22, q2020     ; 0
+    vshufi32x4   m3, m1, m26, q3131 ; 12
+    vshufi32x4   m1, m26, q2020     ; 4
+    vshufi32x4   m6, m4, m23, q3131 ; 24
+    vshufi32x4   m4, m23, q2020     ; 16
+    vshufi32x4   m7, m5, m27, q3131 ; 28
+    vshufi32x4   m5, m27, q2020     ; 20
+    call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+    vshufi32x4   m16, m14, m24, q3131 ; 10
+    vshufi32x4   m14, m24, q2020      ; 2
+    vshufi32x4   m17, m15, m28, q3131 ; 14
+    vshufi32x4   m15, m28, q2020      ; 6
+    vshufi32x4   m20, m18, m25, q3131 ; 26
+    vshufi32x4   m18, m25, q2020      ; 18
+    vshufi32x4   m21, m19, m29, q3131 ; 30
+    vshufi32x4   m19, m29, q2020      ; 22
+    call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+    mova m22, [r4-64*20]
+    mova m26, [r4-64*16]
+    mova m23, [r4-64*19]
+    mova m27, [r4-64*15]
+    mova m24, [r4-64*18]
+    mova m28, [r4-64*14]
+    mova m25, [r4-64*17]
+    mova m29, [r4-64*13]
+    mova [r4-64*20], m14
+    mova [r4-64*19], m15
+    mova [r4-64*18], m16
+    mova [r4-64*17], m17
+    mova [r4-64*16], m18
+    mova [r4-64*15], m19
+    mova [r4-64*14], m20
+    mova [r4-64*13], m21
+    mova m19, [r4-64*12]
+    mova m11, [r4-64* 8]
+    mova m20, [r4-64*11]
+    mova m12, [r4-64* 7]
+    mova m21, [r4-64*10]
+    mova m8, [r4-64* 6]
+    mova m9, [r4-64* 9]
+    mova m18, [r4-64* 5]
+    vshufi32x4   m14, m22, m26, q3232 ; b12 b13 d12 d13
+    vinserti32x8 m22, ym26, 1         ; b10 b11 d10 d11
+    vshufi32x4   m15, m23, m27, q3232 ; b32 b33 d32 d33
+    vinserti32x8 m23, ym27, 1         ; b30 b31 d30 d31
+    vshufi32x4   m16, m24, m28, q3232 ; b52 b53 d52 d53
+    vinserti32x8 m24, ym28, 1         ; b50 b51 d50 d51
+    vshufi32x4   m17, m25, m29, q3232 ; b72 b73 d72 d73
+    vinserti32x8 m25, ym29, 1         ; b70 b71 d70 d71
+    vinserti32x8 m27, m19, ym11, 1    ; f10 f11 h10 h11
+    vshufi32x4   m19, m11, q3232      ; f12 f13 h12 h13
+    vinserti32x8 m28, m20, ym12, 1    ; f30 f31 h30 h31
+    vshufi32x4   m20, m12, q3232      ; f32 f33 h32 h33
+    vinserti32x8 m29, m21, ym8, 1     ; f50 f51 h50 h51
+    vshufi32x4   m21, m8, q3232       ; f52 f53 h52 h53
+    vinserti32x8 m8, m9, ym18, 1      ; f70 f71 h70 h71
+    vshufi32x4   m9, m18, q3232       ; f72 f73 h72 h73
+    vshufi32x4   m26, m22, m27, q3131 ; 9
+    vshufi32x4   m22, m27, q2020      ; 1
+    vshufi32x4   m27, m23, m28, q3131 ; 11
+    vshufi32x4   m23, m28, q2020      ; 3
+    vshufi32x4   m28, m24, m29, q3131 ; 13
+    vshufi32x4   m24, m29, q2020      ; 5
+    vshufi32x4   m29, m25, m8, q3131  ; 15
+    vshufi32x4   m25, m8, q2020       ; 7
+    vshufi32x4   m18, m14, m19, q3131 ; 25
+    vshufi32x4   m14, m19, q2020      ; 17
+    vshufi32x4   m19, m15, m20, q3131 ; 27
+    vshufi32x4   m15, m20, q2020      ; 19
+    vshufi32x4   m20, m16, m21, q3131 ; 29
+ vshufi32x4 m16, m21, q2020 ; 21 + vshufi32x4 m21, m17, m9, q3131 ; 31 + vshufi32x4 m17, m9, q2020 ; 23 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf + jmp .end +.fast: ; bottom/right halves are zero + {evex}vpmulhrsw ym8, ym23, [cq+64* 4] + {evex}vpmulhrsw xm1, xm23, [cq+64*12] + mova m28, [o(dup16_perm)] + {evex}vpmulhrsw ym7, ym23, [cq+64* 8] + vpmulhrsw ym22, ym23, [cq+64* 0] + vpermb m8, m28, m8 + vpermb ym1, ym28, ym1 + vpermb m7, m28, m7 + pmovzxwd m9, ym22 + pslld m9, 16 + call m(idct_16x16_internal_8bpc).main_fast2 + {evex}vpmulhrsw ym21, ym23, [cq+64* 2] + {evex}vpmulhrsw xm15, xm23, [cq+64*14] + {evex}vpmulhrsw xm18, xm23, [cq+64*10] + {evex}vpmulhrsw ym14, ym23, [cq+64* 6] + vpermb m21, m28, m21 + punpcklwd xm15, xm15 + vpermb ym18, ym28, ym18 + vpermb m14, m28, m14 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 + vpmulhrsw ym22, ym23, [cq+64* 1] + {evex}vpmulhrsw xm29, xm23, [cq+64*15] + {evex}vpmulhrsw xm26, xm23, [cq+64* 9] + {evex}vpmulhrsw ym25, ym23, [cq+64* 7] + {evex}vpmulhrsw ym24, ym23, [cq+64* 5] + {evex}vpmulhrsw xm27, xm23, [cq+64*11] + {evex}vpmulhrsw xm8, xm23, [cq+64*13] + {evex}vpmulhrsw ym23, [cq+64* 3] + vpermb m22, m28, m22 + punpcklwd xm29, xm29 + vpermb ym26, ym28, ym26 + vpermb m25, m28, m25 + mova [cq+64* 0], m14 + mova [cq+64* 1], m15 + mova [cq+64* 2], m16 + mova [cq+64* 3], m17 + REPX {vpermb x, m28, x}, m24, m27, m23 + punpcklwd xm28, xm8, xm8 + mova [cq+64* 4], m18 + mova [cq+64* 5], m19 + mova [cq+64* 6], m20 + mova [cq+64* 7], m21 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast + mov r4, rsp + vpbroadcastd m13, [o(pw_16384)] + mova [r4+64*16], m4 + mova [r4+64*17], m5 + mova [r4+64*18], m6 + mova [r4+64*19], m7 + mova [r4+64*28], m26 + mova [r4+64*29], m27 + mova [r4+64*30], m28 + mova [r4+64*31], m29 + call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end + mova [r4+64*20], m22 + mova [r4+64*21], m23 + mova [r4+64*22], m24 + mova [r4+64*23], m25 + mova [r4+64*24], m26 + mova [r4+64*25], m27 + mova [r4+64*26], m28 + mova [r4+64*27], m29 + call .pass2_fast + mova [cq+64* 8], m14 + mova [cq+64* 9], m15 + mova [cq+64*10], m16 + mova [cq+64*11], m17 + mova [cq+64*12], m18 + mova [cq+64*13], m19 + mova [cq+64*14], m20 + mova [cq+64*15], m21 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast + mova [cq+64* 0], m0 + mova [cq+64* 1], m1 + mova [cq+64* 2], m2 + mova [cq+64* 3], m3 + mova [cq+64* 4], m4 + mova [cq+64* 5], m5 + mova [cq+64* 6], m6 + mova [cq+64* 7], m7 + pmulhrsw m0, m13, [r4+64*16] + pmulhrsw m1, m13, [r4+64*17] + pmulhrsw m2, m13, [r4+64*18] + pmulhrsw m3, m13, [r4+64*19] + pmulhrsw m4, m13, [r4+64*20] + pmulhrsw m5, m13, [r4+64*21] + pmulhrsw m6, m13, [r4+64*22] + pmulhrsw m7, m13, [r4+64*23] + mova [cq+64*16], m14 + mova [cq+64*17], m15 + mova [cq+64*18], m16 + mova [cq+64*19], m17 + mova [cq+64*20], m18 + mova [cq+64*21], m19 + mova [cq+64*22], m20 + mova [cq+64*23], m21 + pmulhrsw m14, m13, [r4+64*24] + pmulhrsw m15, m13, [r4+64*25] + pmulhrsw m16, m13, [r4+64*26] + pmulhrsw m17, m13, [r4+64*27] + pmulhrsw m18, m13, [r4+64*28] + pmulhrsw m19, m13, [r4+64*29] + pmulhrsw m20, m13, [r4+64*30] + pmulhrsw m21, m13, [r4+64*31] + mova [cq+64*24], m22 + mova [cq+64*25], m23 + mova [cq+64*26], m24 + mova [cq+64*27], m25 + mova [cq+64*28], m26 + mova [cq+64*29], m27 + mova [cq+64*30], m28 + mova [cq+64*31], m29 + call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round + call .pass2_fast + mova [r4+64*16], m14 + mova [r4+64*17], m15 + mova [r4+64*18], m16 + mova [r4+64*19], m17 + mova [r4+64*20], m18 + 
mova [r4+64*21], m19 + mova [r4+64*22], m20 + mova [r4+64*23], m21 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast +.end: + vpbroadcastd m13, [o(pw_2048)] + lea r5, [strideq*3] + pxor m12, m12 + lea r3, [dstq+r5*8] + lea r6, [strideq+r5] ; stride*4 + add r3, r6 ; dst+stride*28 +%macro IDCT_64x32_END 5 ; src16, src32, mem, off_lo, off_hi + mova m11, [cq+64*( %3)] ; 0 + mova m9, [cq+64*(31-%3)] ; 31 +%if %3 >= 8 + mova m%1, [rsp+64*(%1+16)] +%endif + mova m10, [dstq+%4] + paddsw m8, m11, m9 + psubsw m11, m9 + paddsw m9, m%1, m%2 + psubsw m%1, m%2 + punpcklbw m%2, m10, m12 + punpckhbw m10, m12 + pmulhrsw m8, m13 + pmulhrsw m9, m13 + paddw m8, m%2 + paddw m9, m10 + mova m10, [r3+%5] + pmulhrsw m11, m13 + pmulhrsw m%1, m13 + mova [cq+64*( %3)], m12 + mova [cq+64*(31-%3)], m12 + punpcklbw m%2, m10, m12 + punpckhbw m10, m12 + packuswb m8, m9 + paddw m11, m%2 + paddw m%1, m10 + packuswb m11, m%1 + mova [dstq+%4], m8 + mova [r3 +%5], m11 +%if %3 == 3 || %3 == 7 || %3 == 11 + add dstq, r6 + sub r3, r6 +%endif +%endmacro + IDCT_64x32_END 0, 29, 0, strideq*0, r5 + IDCT_64x32_END 1, 28, 1, strideq*1, strideq*2 + IDCT_64x32_END 2, 27, 2, strideq*2, strideq*1 + IDCT_64x32_END 3, 26, 3, r5 , strideq*0 + IDCT_64x32_END 4, 25, 4, strideq*0, r5 + IDCT_64x32_END 5, 24, 5, strideq*1, strideq*2 + IDCT_64x32_END 6, 23, 6, strideq*2, strideq*1 + IDCT_64x32_END 7, 22, 7, r5 , strideq*0 + IDCT_64x32_END 0, 21, 8, strideq*0, r5 + IDCT_64x32_END 1, 20, 9, strideq*1, strideq*2 + IDCT_64x32_END 2, 19, 10, strideq*2, strideq*1 + IDCT_64x32_END 3, 18, 11, r5 , strideq*0 + IDCT_64x32_END 4, 17, 12, strideq*0, r5 + IDCT_64x32_END 5, 16, 13, strideq*1, strideq*2 + IDCT_64x32_END 6, 15, 14, strideq*2, strideq*1 + IDCT_64x32_END 7, 14, 15, r5 , strideq*0 + RET +ALIGN function_align +.dconly: + movsx r6d, word [cq] + mov [cq], eobd + or r3d, 32 + imul r6d, 181 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 128+256 + sar r6d, 8+1 + jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly2 +ALIGN function_align +.pass1_end_part1: +%macro IDCT_64x32_PASS1_END 3 ; src16, src32, src64 +%if %1 != %3 + mova m%1, [cq+64*%1] +%endif + mova m9, [r4+64*(%3-36)] ; idct64 32+n + mova m11, [r4+64*(-5-%3)] ; idct64 63-n + psubsw m8, m%1, m%2 ; idct32 31-n + paddsw m%1, m%2 ; idct32 0+n +%if %1 == %3 + psubsw m%2, m8, m9 ; out 32+n e + paddsw m8, m9 ; out 31-n d + psubsw m9, m%1, m11 ; out 63-n h + paddsw m%1, m11 ; out 0+n a +%else + paddsw m%2, m8, m9 ; out 23-n c + psubsw m8, m9 ; out 40+n f + paddsw m9, m%1, m11 ; out 8+n b + psubsw m%1, m11 ; out 55-n g +%endif + mova [r4+64*(%3-36)], m8 + mova [r4+64*(-5-%3)], m9 +%endmacro + IDCT_64x32_PASS1_END 0, 29, 0 + IDCT_64x32_PASS1_END 1, 28, 1 + IDCT_64x32_PASS1_END 2, 27, 2 + IDCT_64x32_PASS1_END 3, 26, 3 + IDCT_64x32_PASS1_END 4, 25, 4 + IDCT_64x32_PASS1_END 5, 24, 5 + IDCT_64x32_PASS1_END 6, 23, 6 + IDCT_64x32_PASS1_END 7, 22, 7 +.transpose_2x8x8_hi: ; m0-m7 + m22-m29 (inverted) + punpcklwd m8, m25, m24 ; e0 f0 e1 f1 e2 f2 e3 f3 + punpckhwd m25, m24 ; e4 f4 e5 f5 e6 f6 e7 f7 + punpcklwd m24, m23, m22 ; g0 h0 g1 h1 g2 h2 g3 h3 + punpckhwd m23, m22 ; g4 h4 g5 h5 g6 h6 g7 h7 + punpcklwd m22, m29, m28 ; a0 b0 a1 b1 a2 b2 a3 b3 + punpckhwd m29, m28 ; a4 b4 a5 b5 a6 b6 a7 b7 + punpcklwd m28, m27, m26 ; c0 d0 c1 d1 c2 d2 c3 d3 + punpckhwd m27, m26 ; c4 d4 c5 d5 c6 d6 c7 d7 + punpckldq m26, m29, m27 ; a4 b4 c4 d4 a5 b5 c5 d5 + punpckhdq m29, m27 ; a6 b6 c6 d6 a7 b7 c7 d7 + punpckldq m27, m8, m24 ; e0 f0 g0 h0 e1 f1 g1 h1 + punpckhdq m8, m24 ; e2 f2 g2 h2 e3 f3 g3 h3 + punpckhdq m24, m22, 
m28 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpckldq m22, m28 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckldq m28, m25, m23 ; e4 f4 g4 h4 e5 f5 g5 h5 + punpckhdq m25, m23 ; e6 f6 g6 h6 e7 f7 g7 h7 + punpckhqdq m23, m22, m27 ; 1 23 + punpcklqdq m22, m27 ; 0 22 + punpckhqdq m27, m26, m28 ; 5 27 + punpcklqdq m26, m28 ; 4 26 + punpcklqdq m28, m29, m25 ; 6 28 + punpckhqdq m29, m25 ; 7 29 + punpckhqdq m25, m24, m8 ; 3 25 + punpcklqdq m24, m8 ; 2 24 +.transpose_8x8: + punpckhwd m8, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m6, m7 + punpcklwd m6, m7 + punpckhwd m7, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + punpckhdq m6, m5, m7 + punpckldq m5, m7 + punpckldq m7, m8, m1 + punpckhdq m8, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m7 + punpckhqdq m5, m7 + punpckhqdq m7, m6, m8 + punpcklqdq m6, m8 + ret +.pass1_end_part2: + IDCT_64x32_PASS1_END 0, 21, 8 + IDCT_64x32_PASS1_END 1, 20, 9 + IDCT_64x32_PASS1_END 2, 19, 10 + IDCT_64x32_PASS1_END 3, 18, 11 + IDCT_64x32_PASS1_END 4, 17, 12 + IDCT_64x32_PASS1_END 5, 16, 13 + IDCT_64x32_PASS1_END 6, 15, 14 + IDCT_64x32_PASS1_END 7, 14, 15 +.transpose_2x8x8_lo: ; m0-m7 (inverted) + m14-m21 + punpcklwd m8, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m7, m6 + punpckhwd m7, m6 + punpcklwd m6, m5, m4 + punpckhwd m5, m4 + punpckldq m4, m7, m5 + punpckhdq m7, m5 + punpckldq m5, m8, m2 + punpckhdq m8, m2 + punpckhdq m2, m0, m6 + punpckldq m0, m6 + punpckldq m6, m3, m1 + punpckhdq m3, m1 + punpckhqdq m1, m0, m5 + punpcklqdq m0, m5 + punpckhqdq m5, m4, m6 + punpcklqdq m4, m6 + punpcklqdq m6, m7, m3 + punpckhqdq m7, m3 + punpckhqdq m3, m2, m8 + punpcklqdq m2, m8 + punpckhwd m8, m18, m19 + punpcklwd m18, m19 + punpckhwd m19, m14, m15 + punpcklwd m14, m15 + punpckhwd m15, m20, m21 + punpcklwd m20, m21 + punpckhwd m21, m16, m17 + punpcklwd m16, m17 + punpckhdq m17, m14, m16 + punpckldq m14, m16 + punpckldq m16, m18, m20 + punpckhdq m18, m20 + punpckhdq m20, m19, m21 + punpckldq m19, m21 + punpckldq m21, m8, m15 + punpckhdq m8, m15 + punpckhqdq m15, m14, m16 + punpcklqdq m14, m16 + punpcklqdq m16, m17, m18 + punpckhqdq m17, m18 + punpcklqdq m18, m19, m21 + punpckhqdq m19, m21 + punpckhqdq m21, m20, m8 + punpcklqdq m20, m8 + ret +.pass2_fast: + vshufi32x4 m24, m9, m15, q3131 ; 5 + vshufi32x4 m22, m9, m15, q2020 ; 1 + vshufi32x4 m15, m1, m16, q3131 ; 6 + vshufi32x4 m14, m1, m16, q2020 ; 2 + vshufi32x4 m1, m0, m3, q3131 ; 4 + vshufi32x4 m0, m3, q2020 ; 0 + vshufi32x4 m3, m8, m2, q3131 ; 12 + vshufi32x4 m2, m8, m2, q2020 ; 8 + vshufi32x4 m25, m11, m17, q3131 ; 7 + vshufi32x4 m23, m11, m17, q2020 ; 3 + vshufi32x4 m17, m5, m19, q3131 ; 14 + vshufi32x4 m16, m5, m19, q2020 ; 10 + vshufi32x4 m29, m6, m20, q3131 ; 15 + vshufi32x4 m27, m6, m20, q2020 ; 11 + vshufi32x4 m28, m4, m18, q3131 ; 13 + vshufi32x4 m26, m4, m18, q2020 ; 9 + jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast + +cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 7, 0, dst, stride, c, eob + lea r5, [o_base] + test eobd, eobd + jz .dconly + PROLOGUE 0, 7, 30, 64*96, dst, stride, c, eob +%undef cmp + cmp eobd, 136 + jb .fast + mova m0, [cq+64* 1] + mova m1, [cq+64*31] + mova m2, [cq+64*17] + mova m3, [cq+64*15] + vpbroadcastd m10, [o(pd_2048)] + mov r4, rsp + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + mova m0, [cq+64* 7] + mova m1, [cq+64*25] + mova m2, [cq+64*23] + mova m3, [cq+64* 9] + call 
m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + mova m0, [cq+64* 5] + mova m1, [cq+64*27] + mova m2, [cq+64*21] + mova m3, [cq+64*11] + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + mova m0, [cq+64* 3] + mova m1, [cq+64*29] + mova m2, [cq+64*19] + mova m3, [cq+64*13] + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 + mova m0, [cq+64* 0] + mova m1, [cq+64* 8] + mova m2, [cq+64*16] + mova m3, [cq+64*24] + mova m14, [cq+64* 4] + mova m15, [cq+64*12] + mova m16, [cq+64*20] + mova m17, [cq+64*28] + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast + mova m22, [cq+64* 2] + mova m29, [cq+64*30] + mova m26, [cq+64*18] + mova m25, [cq+64*14] + mova m24, [cq+64*10] + mova m27, [cq+64*22] + mova m28, [cq+64*26] + mova m23, [cq+64* 6] + mova [cq+64* 0], m14 + mova [cq+64* 1], m15 + mova [cq+64* 2], m16 + mova [cq+64* 3], m17 + mova [cq+64* 4], m18 + mova [cq+64* 5], m19 + mova [cq+64* 6], m20 + mova [cq+64* 7], m21 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast + vpbroadcastd m13, [o(pw_8192)] + call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part1 + mova [r4+64*36], m1 + mova [r4+64*37], m3 + mova [r4+64*38], m5 + mova [r4+64*39], m7 + mova [r4+64*44], m23 + mova [r4+64*45], m25 + mova [r4+64*46], m27 + mova [r4+64*47], m29 + pmulhrsw m23, m13, m0 ; a0 + pmulhrsw m25, m13, m2 ; a2 + pmulhrsw m27, m13, m4 ; a4 + pmulhrsw m29, m13, m6 ; a6 + call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part2 + lea r6, [r4-64*4] + add r4, 64*28 + call .pass2_end + mov r4, rsp + mova m0, [r4+64*23] + mova m1, [r4+64*22] + mova m2, [r4+64*21] + mova m3, [r4+64*20] + mova m4, [r4+64*19] + mova m5, [r4+64*18] + mova m6, [r4+64*17] + mova m7, [r4+64*16] + mova m22, [r4+64*15] + mova m23, [r4+64*14] + mova m24, [r4+64*13] + mova m25, [r4+64*12] + mova m26, [r4+64*11] + mova m27, [r4+64*10] + mova m28, [r4+64* 9] + mova m29, [r4+64* 8] + call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_hi + vpbroadcastd m13, [o(pw_8192)] + mova [r4+64* 8], m1 + mova [r4+64* 9], m3 + mova [r4+64*10], m5 + mova [r4+64*11], m7 + mova [r4+64*16], m23 + mova [r4+64*17], m25 + mova [r4+64*18], m27 + mova [r4+64*19], m29 + pmulhrsw m23, m13, m0 ; b0 + pmulhrsw m25, m13, m2 ; b2 + pmulhrsw m27, m13, m4 ; b4 + pmulhrsw m29, m13, m6 ; b6 + mova m0, [r4+64*31] + mova m1, [r4+64*30] + mova m2, [r4+64*29] + mova m3, [r4+64*28] + mova m4, [r4+64*27] + mova m5, [r4+64*26] + mova m6, [r4+64*25] + mova m7, [r4+64*24] + mova m14, [r4+64* 7] + mova m15, [r4+64* 6] + mova m16, [r4+64* 5] + mova m17, [r4+64* 4] + mova m18, [r4+64* 3] + mova m19, [r4+64* 2] + mova m20, [r4+64* 1] + mova m21, [r4+64* 0] + call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_lo + mov r6, cq + call .pass2_end + jmp .end +.fast: ; bottom/right halves are zero + mova m28, [o(dup16_perm)] + pmovzxwd m9, [cq+64* 0] + vpermb m8, m28, [cq+64* 4] + vpermb ym1, ym28, [cq+64*12] + vpermb m7, m28, [cq+64* 8] + pslld m9, 16 + call m(idct_16x16_internal_8bpc).main_fast2 + vpermb m21, m28, [cq+64* 2] + vpermb ym15, ym28, [cq+64*14] + vpermb ym18, ym28, [cq+64*10] + vpermb m14, m28, [cq+64* 6] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 + vpermb m22, m28, [cq+64* 1] + vpermb ym29, ym28, [cq+64*15] + vpermb ym26, ym28, [cq+64* 9] + vpermb m25, m28, [cq+64* 7] + vpermb m24, m28, [cq+64* 5] + vpermb ym27, ym28, [cq+64*11] + vpermb m23, m28, [cq+64* 3] + vpermb ym28, ym28, [cq+64*13] + mova [cq+64* 0], m14 + mova [cq+64* 1], m15 + mova [cq+64* 2], m16 + mova [cq+64* 3], m17 + mova 
[cq+64* 4], m18 + mova [cq+64* 5], m19 + mova [cq+64* 6], m20 + mova [cq+64* 7], m21 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast + vpbroadcastd m13, [o(pw_8192)] + mova [cq+64*16], m4 + mova [cq+64*17], m5 + mova [cq+64*18], m6 + mova [cq+64*19], m7 + mova [cq+64*28], m26 + mova [cq+64*29], m27 + mova [cq+64*30], m28 + mova [cq+64*31], m29 + call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end + mova [cq+64*20], m22 + mova [cq+64*21], m23 + mova [cq+64*22], m24 + mova [cq+64*23], m25 + mova [cq+64*24], m26 + mova [cq+64*25], m27 + mova [cq+64*26], m28 + mova [cq+64*27], m29 + lea r4, [rsp+64*64] + lea r3, [rsp+64*32] + call .pass2_fast + pmulhrsw m0, m13, [cq+64*16] + pmulhrsw m1, m13, [cq+64*17] + pmulhrsw m2, m13, [cq+64*18] + pmulhrsw m3, m13, [cq+64*19] + pmulhrsw m4, m13, [cq+64*20] + pmulhrsw m5, m13, [cq+64*21] + pmulhrsw m6, m13, [cq+64*22] + pmulhrsw m7, m13, [cq+64*23] + pmulhrsw m14, m13, [cq+64*24] + pmulhrsw m15, m13, [cq+64*25] + pmulhrsw m16, m13, [cq+64*26] + pmulhrsw m17, m13, [cq+64*27] + pmulhrsw m18, m13, [cq+64*28] + pmulhrsw m19, m13, [cq+64*29] + pmulhrsw m20, m13, [cq+64*30] + pmulhrsw m21, m13, [cq+64*31] + call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round + mov r4, rsp + mov r3, cq + call .pass2_fast +.end: + vpbroadcastd m17, [o(pw_2048)] + lea r5, [strideq*8] + mov r3, dstq + pxor m16, m16 + sub r4, 64*5 ; rsp+64*31 + mov r6, rsp +.end_loop: + mova m2, [r6+64*32] ; idct16 0+n lo + mova m7, [r6+64*48] ; idct32 31-n lo + mova m6, [cq+64* 0] ; idct16 0+n hi + mova m0, [cq+64*16] ; idct32 31-n hi + mova m4, [r4+64*64] ; idct64 63-n lo + mova m1, [r4+64* 0] ; idct64 63-n hi + mova m5, [r6+64*64] ; idct64 32+n lo + mova m8, [r6+64* 0] ; idct64 32+n hi + sub r3, strideq + paddsw m3, m2, m7 ; idct32 0+n lo + mova m12, [dstq+r5*0] + psubsw m2, m7 ; idct32 31-n lo + mova m15, [r3 +r5*8] + paddsw m7, m6, m0 ; idct32 0+n hi + mova m13, [r3 +r5*4] + psubsw m6, m0 ; idct32 31-n hi + mova m14, [dstq+r5*4] + paddsw m0, m3, m4 ; out 0+n lo + add r6, 64 + psubsw m3, m4 ; out 63-n lo + sub r4, 64 + paddsw m4, m7, m1 ; out 0+n hi + mova [cq+64* 0], m16 + psubsw m7, m1 ; out 63-n hi + mova [cq+64*16], m16 + paddsw m1, m2, m5 ; out 31-n lo + add cq, 64 + psubsw m2, m5 ; out 32+n lo + paddsw m5, m6, m8 ; out 31-n hi + psubsw m6, m8 ; out 32+n hi + pmulhrsw m0, m17 + punpcklbw m8, m12, m16 + pmulhrsw m4, m17 + punpckhbw m12, m16 + pmulhrsw m3, m17 + punpcklbw m11, m15, m16 + pmulhrsw m7, m17 + punpckhbw m15, m16 + pmulhrsw m1, m17 + punpcklbw m9, m13, m16 + pmulhrsw m5, m17 + punpckhbw m13, m16 + pmulhrsw m2, m17 + punpcklbw m10, m14, m16 + pmulhrsw m6, m17 + punpckhbw m14, m16 + paddw m0, m8 + paddw m4, m12 + packuswb m0, m4 + paddw m3, m11 + paddw m7, m15 + packuswb m3, m7 + paddw m1, m9 + paddw m5, m13 + packuswb m1, m5 + paddw m2, m10 + paddw m6, m14 + packuswb m2, m6 + mova [dstq+r5*0], m0 + mova [r3 +r5*8], m3 + mova [r3 +r5*4], m1 + mova [dstq+r5*4], m2 + add dstq, strideq + cmp r6, r4 + jb .end_loop + RET +.dconly: + movsx r6d, word [cq] + mov [cq], eobd + or r3d, 64 + jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly +ALIGN function_align +.pass2_end: + REPX {pmulhrsw x, m13}, m22, m24, m26, m28, m14, m16, m18, m20, m0, m2, m4, m6 + mova [r4+64*20], m1 + mova [r4+64*21], m3 + mova [r4+64*22], m5 + mova [r4+64*23], m7 + vinserti32x8 m1, m23, ym14, 1 ; a00 a01 c00 c01 + vshufi32x4 m3, m23, m14, q3232 ; a02 a03 c02 c03 + vinserti32x8 m5, m22, ym0, 1 ; e00 e01 g00 g01 + vshufi32x4 m14, m22, m0, q3232 ; e02 e03 g02 g03 + mova [r4+64*12], m15 + mova [r4+64*13], m17 + 
mova [r4+64*14], m19 + mova [r4+64*15], m21 + vinserti32x8 m15, m27, ym18, 1 ; a40 a41 c40 c41 + vshufi32x4 m17, m27, m18, q3232 ; a42 a43 c42 c43 + vinserti32x8 m18, m26, ym4, 1 ; e40 e41 g40 g41 + vshufi32x4 m19, m26, m4, q3232 ; e42 e43 g42 g43 + vinserti32x8 m22, m25, ym16, 1 ; a20 a21 c20 c21 + vshufi32x4 m26, m25, m16, q3232 ; a22 a23 c22 c23 + vinserti32x8 m25, m24, ym2, 1 ; e20 e21 g20 g21 + vshufi32x4 m27, m24, m2, q3232 ; e22 e23 g22 g23 + vinserti32x8 m23, m29, ym20, 1 ; a60 a61 c60 c61 + vshufi32x4 m29, m20, q3232 ; a62 a63 c62 c63 + vshufi32x4 m13, m28, m6, q3232 ; e62 e63 g62 g63 + vinserti32x8 m28, ym6, 1 ; e60 e61 g60 g61 + vshufi32x4 m0, m1, m5, q2020 ; 0 + vshufi32x4 m1, m5, q3131 ; 8 + vshufi32x4 m2, m3, m14, q2020 ; 16 + vshufi32x4 m3, m14, q3131 ; 24 + vshufi32x4 m14, m15, m18, q2020 ; 4 + vshufi32x4 m15, m18, q3131 ; 12 + vshufi32x4 m16, m17, m19, q2020 ; 20 + vshufi32x4 m17, m19, q3131 ; 28 + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast + vshufi32x4 m24, m22, m25, q3131 ; 10 + vshufi32x4 m22, m25, q2020 ; 2 + vshufi32x4 m25, m23, m28, q3131 ; 14 + vshufi32x4 m23, m28, q2020 ; 6 + vshufi32x4 m28, m26, m27, q3131 ; 26 + vshufi32x4 m26, m27, q2020 ; 18 + vshufi32x4 m27, m29, m13, q2020 ; 22 + vshufi32x4 m29, m13, q3131 ; 30 + mova [r6+64* 0], m0 + mova [r6+64* 1], m1 + mova [r6+64* 2], m2 + mova [r6+64* 3], m3 + mova [r6+64* 4], m4 + mova [r6+64* 5], m5 + mova [r6+64* 6], m6 + mova [r6+64* 7], m7 + mova [r6+64* 8], m14 + mova [r6+64* 9], m15 + mova [r6+64*10], m16 + mova [r6+64*11], m17 + mova [r6+64*12], m18 + mova [r6+64*13], m19 + mova [r6+64*14], m20 + mova [r6+64*15], m21 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast + vpbroadcastd m13, [o(pw_8192)] + mova [r6+64*16], m29 + mova [r6+64*17], m28 + mova [r6+64*18], m27 + mova [r6+64*19], m26 + mova [r6+64*20], m25 + mova [r6+64*21], m24 + mova [r6+64*22], m23 + mova [r6+64*23], m22 + mova [r6+64*24], m21 + mova [r6+64*25], m20 + mova [r6+64*26], m19 + mova [r6+64*27], m18 + mova [r6+64*28], m17 + mova [r6+64*29], m16 + mova [r6+64*30], m15 + mova [r6+64*31], m14 + pmulhrsw m15, m13, [r4+64* 8] ; 1 9 17 25 + pmulhrsw m16, m13, [r4+64*12] + pmulhrsw m17, m13, [r4+64*16] + pmulhrsw m18, m13, [r4+64*20] + pmulhrsw m19, m13, [r4+64*11] ; 7 15 23 31 + pmulhrsw m20, m13, [r4+64*15] + pmulhrsw m21, m13, [r4+64*19] + pmulhrsw m22, m13, [r4+64*23] + vinserti32x8 m14, m15, ym16, 1 ; a1 a9 c1 c9 + vshufi32x4 m15, m16, q3232 ; a17 a25 c17 c25 + vinserti32x8 m16, m17, ym18, 1 ; e1 e9 g1 g9 + vshufi32x4 m17, m18, q3232 ; e17 e25 g17 g25 + pmulhrsw m23, m13, [r4+64*10] ; 5 13 21 29 + pmulhrsw m24, m13, [r4+64*14] + pmulhrsw m25, m13, [r4+64*18] + pmulhrsw m26, m13, [r4+64*22] + vinserti32x8 m18, m19, ym20, 1 ; a7 a15 c7 c15 + vshufi32x4 m19, m20, q3232 ; a23 a31 c23 c31 + vinserti32x8 m20, m21, ym22, 1 ; e7 e15 g7 g15 + vshufi32x4 m21, m22, q3232 ; e23 e31 g23 g31 + pmulhrsw m27, m13, [r4+64* 9] ; 3 11 19 27 + pmulhrsw m28, m13, [r4+64*13] + pmulhrsw m29, m13, [r4+64*17] + pmulhrsw m13, [r4+64*21] + vshufi32x4 m0, m14, m16, q2020 ; 1 + vshufi32x4 m1, m19, m21, q3131 ; 31 + vshufi32x4 m2, m15, m17, q2020 ; 17 + vshufi32x4 m3, m18, m20, q3131 ; 15 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + vshufi32x4 m0, m18, m20, q2020 ; 7 + vshufi32x4 m1, m15, m17, q3131 ; 25 + vshufi32x4 m2, m19, m21, q2020 ; 23 + vshufi32x4 m3, m14, m16, q3131 ; 9 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + vinserti32x8 m22, m23, ym24, 1 ; a5 a13 c5 c13 + vshufi32x4 m23, m24, q3232 ; a21 a29 c21 c29 + vinserti32x8 
m24, m25, ym26, 1 ; e5 e13 g5 g13 + vshufi32x4 m25, m26, q3232 ; e21 e29 g21 g29 + vinserti32x8 m26, m27, ym28, 1 ; a3 a11 c3 c11 + vshufi32x4 m27, m28, q3232 ; a19 a27 c19 c27 + vinserti32x8 m28, m29, ym13, 1 ; e3 e11 g3 g11 + vshufi32x4 m29, m13, q3232 ; e19 e27 g19 g27 + vshufi32x4 m0, m22, m24, q2020 ; 5 + vshufi32x4 m1, m27, m29, q3131 ; 27 + vshufi32x4 m2, m23, m25, q2020 ; 21 + vshufi32x4 m3, m26, m28, q3131 ; 11 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + vshufi32x4 m0, m26, m28, q2020 ; 3 + vshufi32x4 m1, m23, m25, q3131 ; 29 + vshufi32x4 m2, m27, m29, q2020 ; 19 + vshufi32x4 m3, m22, m24, q3131 ; 13 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + jmp m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 +ALIGN function_align +.pass2_fast: + vshufi32x4 m23, m1, m16, q3131 ; 6 + vshufi32x4 m22, m1, m16, q2020 ; 2 + vshufi32x4 m14, m0, m3, q3131 ; 4 + vshufi32x4 m26, m0, m3, q2020 ; 0 + vshufi32x4 m28, m9, m15, q3131 ; 5 + vshufi32x4 m0, m9, m15, q2020 ; 1 + vshufi32x4 m16, m11, m17, q3131 ; 7 + vshufi32x4 m29, m11, m17, q2020 ; 3 + vshufi32x4 m15, m8, m2, q3131 ; 12 + vshufi32x4 m27, m8, m2, q2020 ; 8 + vshufi32x4 m25, m5, m19, q3131 ; 14 + vshufi32x4 m24, m5, m19, q2020 ; 10 + vshufi32x4 m3, m6, m20, q3131 ; 15 + vshufi32x4 m19, m6, m20, q2020 ; 11 + vshufi32x4 m17, m4, m18, q3131 ; 13 + vshufi32x4 m18, m4, m18, q2020 ; 9 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast + mova m0, m16 + mova m3, m18 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast + mova m0, m28 + mova m3, m19 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast + mova m0, m29 + mova m3, m17 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 + mova m0, m26 + mova m1, m27 + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2 + mova [r3+64* 0], m0 + mova [r3+64* 1], m1 + mova [r3+64* 2], m2 + mova [r3+64* 3], m3 + mova [r3+64* 4], m4 + mova [r3+64* 5], m5 + mova [r3+64* 6], m6 + mova [r3+64* 7], m7 + mova [r3+64* 8], m14 + mova [r3+64* 9], m15 + mova [r3+64*10], m16 + mova [r3+64*11], m17 + mova [r3+64*12], m18 + mova [r3+64*13], m19 + mova [r3+64*14], m20 + mova [r3+64*15], m21 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2 + mova [r3+64*16], m29 + mova [r3+64*17], m28 + mova [r3+64*18], m27 + mova [r3+64*19], m26 + mova [r3+64*20], m25 + mova [r3+64*21], m24 + mova [r3+64*22], m23 + mova [r3+64*23], m22 + mova [r3+64*24], m21 + mova [r3+64*25], m20 + mova [r3+64*26], m19 + mova [r3+64*27], m18 + mova [r3+64*28], m17 + mova [r3+64*29], m16 + mova [r3+64*30], m15 + mova [r3+64*31], m14 + ret + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/itx_sse.asm b/third_party/dav1d/src/x86/itx_sse.asm new file mode 100644 index 0000000000..ec7e3a52f4 --- /dev/null +++ b/third_party/dav1d/src/x86/itx_sse.asm @@ -0,0 +1,6533 @@ +; Copyright © 2018-2021, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution.
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + + +SECTION_RODATA 16 + +deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 + +deint_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +deint_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7 + +%macro COEF_PAIR 2-3 0 ; !0 = m%1_m%2, 2 = no %2_%1 +pw_%1_m%2: times 4 dw %1, -%2 +%if %3 != 2 +pw_%2_%1: times 4 dw %2, %1 +%endif +%if %3 +pw_m%1_m%2: times 4 dw -%1, -%2 +%endif +%endmacro + +;adst4 +pw_1321_3803: times 4 dw 1321, 3803 +pw_2482_m1321: times 4 dw 2482, -1321 +pw_3344_2482: times 4 dw 3344, 2482 +pw_3344_m3803: times 4 dw 3344, -3803 +pw_3344_m3344: times 4 dw 3344, -3344 +pw_0_3344: times 4 dw 0, 3344 +pw_m6688_m3803: times 4 dw -6688, -3803 + +COEF_PAIR 2896, 2896 +COEF_PAIR 1567, 3784 +COEF_PAIR 799, 4017 +COEF_PAIR 3406, 2276 +COEF_PAIR 401, 4076 +COEF_PAIR 1931, 3612 +COEF_PAIR 3166, 2598 +COEF_PAIR 3920, 1189 +COEF_PAIR 3784, 1567, 1 +COEF_PAIR 995, 3973 +COEF_PAIR 1751, 3703 +COEF_PAIR 3513, 2106 +COEF_PAIR 3857, 1380 +COEF_PAIR 4017, 799, 1 +COEF_PAIR 201, 4091 +COEF_PAIR 2440, 3290 +COEF_PAIR 3035, 2751 +COEF_PAIR 4052, 601 +COEF_PAIR 2276, 3406, 1 +COEF_PAIR 4076, 401, 2 +COEF_PAIR 2598, 3166, 2 +COEF_PAIR 3612, 1931, 2 +COEF_PAIR 1189, 3920, 2 + +pd_2048: times 4 dd 2048 +pw_2048: times 8 dw 2048 +pw_m2048: times 8 dw -2048 +pw_4096: times 8 dw 4096 +pw_16384: times 8 dw 16384 +pw_m16384: times 8 dw -16384 +pw_1697x16: times 8 dw 1697*16 +pw_1697x8: times 8 dw 1697*8 +pw_2896x8: times 8 dw 2896*8 +pw_3344x8: times 8 dw 3344*8 +pw_8192: times 8 dw 8192 +pw_m8192: times 8 dw -8192 +pw_5: times 8 dw 5 +pw_201x8: times 8 dw 201*8 +pw_4091x8: times 8 dw 4091*8 +pw_m2751x8: times 8 dw -2751*8 +pw_3035x8: times 8 dw 3035*8 +pw_1751x8: times 8 dw 1751*8 +pw_3703x8: times 8 dw 3703*8 +pw_m1380x8: times 8 dw -1380*8 +pw_3857x8: times 8 dw 3857*8 +pw_995x8: times 8 dw 995*8 +pw_3973x8: times 8 dw 3973*8 +pw_m2106x8: times 8 dw -2106*8 +pw_3513x8: times 8 dw 3513*8 +pw_2440x8: times 8 dw 2440*8 +pw_3290x8: times 8 dw 3290*8 +pw_m601x8: times 8 dw -601*8 +pw_4052x8: times 8 dw 4052*8 + +pw_4095x8: times 8 dw 4095*8 +pw_101x8: times 8 dw 101*8 +pw_2967x8: times 8 dw 2967*8 +pw_m2824x8: times 8 dw -2824*8 +pw_3745x8: times 8 dw 3745*8 +pw_1660x8: times 8 dw 1660*8 +pw_3822x8: times 8 dw 3822*8 +pw_m1474x8: times 8 dw -1474*8 +pw_3996x8: times 8 dw 3996*8 +pw_897x8: times 8 dw 897*8 +pw_3461x8: times 8 dw 3461*8 +pw_m2191x8: times 8 dw -2191*8 +pw_3349x8: times 8 dw 3349*8 +pw_2359x8: times 8 dw 2359*8 +pw_4036x8: times 8 dw 4036*8 +pw_m700x8: times 8 dw -700*8 +pw_4065x8: times 8 dw 4065*8 +pw_501x8: times 8 dw 501*8 +pw_3229x8: times 8 dw 3229*8 +pw_m2520x8: times 8 dw -2520*8 +pw_3564x8: times 8 dw 3564*8
+pw_2019x8: times 8 dw 2019*8 +pw_3948x8: times 8 dw 3948*8 +pw_m1092x8: times 8 dw -1092*8 +pw_3889x8: times 8 dw 3889*8 +pw_1285x8: times 8 dw 1285*8 +pw_3659x8: times 8 dw 3659*8 +pw_m1842x8: times 8 dw -1842*8 +pw_3102x8: times 8 dw 3102*8 +pw_2675x8: times 8 dw 2675*8 +pw_4085x8: times 8 dw 4085*8 +pw_m301x8: times 8 dw -301*8 + +SECTION .text + +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +%if ARCH_X86_64 +%define o(x) x +%else +%define o(x) r5-$$+x ; PIC +%endif + +%macro WRITE_4X4 9 ;src[1-2], tmp[1-3], row[1-4] + lea r2, [dstq+strideq*2] +%assign %%i 1 +%rotate 5 +%rep 4 + %if %1 & 2 + CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1) + %else + CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1) + %endif + %assign %%i %%i + 1 + %rotate 1 +%endrep + + movd m%3, [%%row_adr1] ;dst0 + movd m%5, [%%row_adr2] ;dst1 + punpckldq m%3, m%5 ;high: dst1 :low: dst0 + movd m%4, [%%row_adr3] ;dst2 + movd m%5, [%%row_adr4] ;dst3 + punpckldq m%4, m%5 ;high: dst3 :low: dst2 + + pxor m%5, m%5 + punpcklbw m%3, m%5 ;extend byte to word + punpcklbw m%4, m%5 ;extend byte to word + + paddw m%3, m%1 ;high: dst1 + out1 ;low: dst0 + out0 + paddw m%4, m%2 ;high: dst3 + out3 ;low: dst2 + out2 + + packuswb m%3, m%4 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0 + + movd [%%row_adr1], m%3 ;store dst0 + out0 + pshuflw m%4, m%3, q1032 + movd [%%row_adr2], m%4 ;store dst1 + out1 + punpckhqdq m%3, m%3 + movd [%%row_adr3], m%3 ;store dst2 + out2 + psrlq m%3, 32 + movd [%%row_adr4], m%3 ;store dst3 + out3 +%endmacro + +%macro ITX4_END 4-5 2048 ; row[1-4], rnd +%if %5 + mova m2, [o(pw_%5)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 +%endif + + WRITE_4X4 0, 1, 2, 3, 4, %1, %2, %3, %4 + ret +%endmacro + +; flags: 1 = swap, 2: coef_regs, 4: no_pack +%macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags +%if %6 & 2 + pmaddwd m%2, m%4, m%1 + pmaddwd m%1, m%5 +%elif %6 & 1 + pmaddwd m%2, m%1, [o(pw_%5_%4)] + pmaddwd m%1, [o(pw_%4_m%5)] +%else + pmaddwd m%2, m%1, [o(pw_%4_m%5)] + pmaddwd m%1, [o(pw_%5_%4)] +%endif + paddd m%2, m%3 + paddd m%1, m%3 + psrad m%2, 12 + psrad m%1, 12 +%if %6 & 4 == 0 + packssdw m%1, m%2 +%endif +%endmacro + +%macro IDCT4_1D_PACKED 0-1 ;pw_2896x8 + mova m3, [o(pd_2048)] + punpckhwd m2, m0, m1 ;unpacked in1 in3 + punpcklwd m0, m1 ;unpacked in0 in2 + ITX_MUL2X_PACK 2, 1, 3, 1567, 3784 + ITX_MUL2X_PACK 0, 1, 3, 2896, 2896 + psubsw m1, m0, m2 ;high: out2 ;low: out3 + paddsw m0, m2 ;high: out1 ;low: out0 +%endmacro + +%macro INV_TXFM_FN 4+ ; type1, type2, size, xmm/stack +cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, %4, dst, stride, coeff, eob, tx2 + %define %%p1 m(i%1_%3_internal_8bpc) +%if ARCH_X86_32 + LEA r5, $$ +%endif +%if has_epilogue +%ifidn %1_%2, dct_dct + test eobd, eobd + jz %%end +%endif + lea tx2q, [o(m(i%2_%3_internal_8bpc).pass2)] + call %%p1 + RET +%%end: +%else + lea tx2q, [o(m(i%2_%3_internal_8bpc).pass2)] +%ifidn %1_%2, dct_dct + test eobd, eobd + jnz %%p1 +%else + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endif +%endmacro + +%macro INV_TXFM_4X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x4, 6 +%ifidn %1_%2, dct_dct + pshuflw m0, [coeffq], q0000 + punpcklqdq m0, m0 + mova m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + mov [coeffq], eobd ;0 + pmulhrsw m0, m1 + mova m1, m0 + TAIL_CALL m(iadst_4x4_internal_8bpc).end2 +%endif +%endmacro + +INIT_XMM ssse3 +; itx16 relies on dct_dct being the first function. If you change the order, adjust `itx8_start` in itx16. 
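+; Scalar sketch of the two rounding primitives used throughout this file,
+; derived from the macros and constants above (reference only):
+;   pmulhrsw computes (x*y + 0x4000) >> 15, so a pw_*x8 constant (coef*8)
+;   scales by coef/4096 with rounding;
+;   ITX_MUL2X_PACK (default flags) maps each packed word pair (a, b) to
+;   lo = (a*coef2 + b*coef1 + 2048) >> 12 and hi = (a*coef1 - b*coef2 + 2048) >> 12.
+; The dw coefficients are Q12 fixed-point (4096 = 1.0; 2896 ~= 4096/sqrt(2)).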
+ +INV_TXFM_4X4_FN dct, dct +INV_TXFM_4X4_FN dct, adst +INV_TXFM_4X4_FN dct, flipadst +INV_TXFM_4X4_FN dct, identity + +cglobal idct_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m0, [coeffq+16*0] ;high: in1 ;low: in0 + mova m1, [coeffq+16*1] ;high: in3 ;low in2 + + IDCT4_1D_PACKED + + mova m2, [o(deint_shuf)] + shufps m3, m0, m1, q1331 + shufps m0, m1, q0220 + pshufb m0, m2 ;high: in1 ;low: in0 + pshufb m1, m3, m2 ;high: in3 ;low :in2 + jmp tx2q + +.pass2: + IDCT4_1D_PACKED + + pxor m2, m2 + mova [coeffq+16*0], m2 + mova [coeffq+16*1], m2 ;memset(coeff, 0, sizeof(*coeff) * sh * sw); + + ITX4_END 0, 1, 3, 2 + +INV_TXFM_4X4_FN adst, dct +INV_TXFM_4X4_FN adst, adst +INV_TXFM_4X4_FN adst, flipadst +INV_TXFM_4X4_FN adst, identity + +cglobal iadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + call .main + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 ;high: in3 ;low :in2 + punpcklwd m0, m2 ;high: in1 ;low: in0 + jmp tx2q + +.pass2: + call .main + +.end: + pxor m2, m2 + mova [coeffq+16*0], m2 + mova [coeffq+16*1], m2 + +.end2: + ITX4_END 0, 1, 2, 3 + +ALIGN function_align +cglobal_label .main + punpcklwd m2, m0, m1 ;unpacked in0 in2 + punpckhwd m0, m1 ;unpacked in1 in3 + mova m3, m0 + pmaddwd m1, m2, [o(pw_3344_m3344)];3344 * in0 - 3344 * in2 + pmaddwd m0, [o(pw_0_3344)] ;3344 * in3 + paddd m1, m0 ;t2 + pmaddwd m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 + pmaddwd m2, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 + pmaddwd m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 + pmaddwd m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3 + paddd m4, m0 ;t0 + t3 + pmaddwd m3, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 + mova m0, [o(pd_2048)] + paddd m1, m0 ;t2 + 2048 + paddd m2, m0 + paddd m0, m4 ;t0 + t3 + 2048 + paddd m5, m2 ;t1 + t3 + 2048 + paddd m2, m4 + paddd m2, m3 ;t0 + t1 - t3 + 2048 + REPX {psrad x, 12}, m1, m0, m5, m2 + packssdw m0, m5 ;high: out1 ;low: out0 + packssdw m1, m2 ;high: out3 ;low: out2 + ret + +INV_TXFM_4X4_FN flipadst, dct +INV_TXFM_4X4_FN flipadst, adst +INV_TXFM_4X4_FN flipadst, flipadst +INV_TXFM_4X4_FN flipadst, identity + +cglobal iflipadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + call m(iadst_4x4_internal_8bpc).main + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 ;high: in3 ;low :in2 + punpckhwd m1, m2 ;high: in1 ;low: in0 + jmp tx2q + +.pass2: + call m(iadst_4x4_internal_8bpc).main + +.end: + pxor m2, m2 + mova [coeffq+16*0], m2 + mova [coeffq+16*1], m2 + +.end2: + ITX4_END 3, 2, 1, 0 + +INV_TXFM_4X4_FN identity, dct +INV_TXFM_4X4_FN identity, adst +INV_TXFM_4X4_FN identity, flipadst +INV_TXFM_4X4_FN identity, identity + +cglobal iidentity_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + mova m3, [o(pw_1697x8)] + pmulhrsw m2, m0, m3 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 ;high: in3 ;low :in2 + punpcklwd m0, m2 ;high: in1 ;low: in0 + jmp tx2q + +.pass2: + mova m3, [o(pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + jmp m(iadst_4x4_internal_8bpc).end + +%macro IWHT4_1D_PACKED 0 + punpckhqdq m3, m0, m1 ;low: in1 high: in3 + punpcklqdq m0, m1 ;low: in0 high: in2 + psubw m2, m0, m3 ;low: in0 - in1 high: in2 - in3 + paddw m0, m3 ;low: in0 + in1 high: in2 + in3 + punpckhqdq m2, m2 ;t2 t2 + punpcklqdq m0, m0 ;t0 t0 +
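; t4 = (t0 - t2) >> 1 is the shared lifting term of the integer WHT +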
psubw m1, m0, m2 + psraw m1, 1 ;t4 t4 + psubw m1, m3 ;low: t1/out2 high: t3/out1 + psubw m0, m1 ;high: out0 + paddw m2, m1 ;low: out3 +%endmacro + +INIT_XMM sse2 +cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, coeff + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + pxor m2, m2 + mova [coeffq+16*0], m2 + mova [coeffq+16*1], m2 + psraw m0, 2 + psraw m1, 2 + IWHT4_1D_PACKED + punpckhwd m0, m1 + punpcklwd m3, m1, m2 + punpckhdq m1, m0, m3 + punpckldq m0, m3 + IWHT4_1D_PACKED + shufpd m0, m2, 0x01 + ITX4_END 0, 3, 2, 1, 0 + +%macro IDCT8_1D_PACKED 0 + mova m6, [o(pd_2048)] + punpckhwd m4, m0, m3 ;unpacked in1 in7 + punpcklwd m0, m2 ;unpacked in0 in4 + punpckhwd m2, m1 ;unpacked in5 in3 + punpcklwd m1, m3 ;unpacked in2 in6 + ITX_MUL2X_PACK 4, 3, 6, 799, 4017 ;low: t7a high: t4a + ITX_MUL2X_PACK 2, 3, 6, 3406, 2276 ;low: t6a high: t5a + ITX_MUL2X_PACK 1, 3, 6, 1567, 3784 ;low: t3 high: t2 + psubsw m3, m4, m2 ;low: t6a high: t5a + paddsw m4, m2 ;low: t7 high: t4 + pshufb m3, [o(deint_shuf1)] + ITX_MUL2X_PACK 0, 2, 6, 2896, 2896 ;low: t0 high: t1 + ITX_MUL2X_PACK 3, 2, 6, 2896, 2896 ;low: t6 high: t5 + psubsw m2, m0, m1 ;low: tmp3 high: tmp2 + paddsw m0, m1 ;low: tmp0 high: tmp1 + punpcklqdq m1, m4, m3 ;low: t7 high: t6 + punpckhqdq m4, m3 ;low: t4 high: t5 + psubsw m3, m0, m1 ;low: out7 high: out6 + paddsw m0, m1 ;low: out0 high: out1 + paddsw m1, m2, m4 ;low: out3 high: out2 + psubsw m2, m4 ;low: out4 high: out5 +%endmacro + +;dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 +;dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 +%macro ITX_MULSUB_2W 7-8 0 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2_in_tmp1 + punpckhwd m%4, m%1, m%2 + punpcklwd m%1, m%2 +%if %7 < 8 + pmaddwd m%2, m%7, m%1 + pmaddwd m%3, m%7, m%4 +%else + mova m%2, [o(pw_%7_%6)] +%if %8 + pmaddwd m%3, m%1, m%2 + pmaddwd m%2, m%4 +%else + pmaddwd m%3, m%4, m%2 + pmaddwd m%2, m%1 +%endif +%endif + paddd m%3, m%5 + paddd m%2, m%5 + psrad m%3, 12 + psrad m%2, 12 +%if %8 + packssdw m%3, m%2 +%else + packssdw m%2, m%3 ;dst2 +%endif +%if %7 < 8 + pmaddwd m%4, m%6 + pmaddwd m%1, m%6 +%elif %8 + mova m%2, [o(pw_%6_m%7)] + pmaddwd m%4, m%2 + pmaddwd m%1, m%2 +%else + mova m%3, [o(pw_%6_m%7)] + pmaddwd m%4, m%3 + pmaddwd m%1, m%3 +%endif + paddd m%4, m%5 + paddd m%1, m%5 + psrad m%4, 12 + psrad m%1, 12 + packssdw m%1, m%4 ;dst1 +%endmacro + +%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048 + ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, 1 ;t2, t3 + ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, 1 ;t1, t0 + psubsw m%3, m%1, m%2 ;out2 + paddsw m%2, m%1 ;out1 + paddsw m%1, m%5, m%4 ;out0 + psubsw m%4, m%5 ;out3 +%endmacro + +%macro WRITE_4X8 4 ;row[1-4] + WRITE_4X4 0, 1, 4, 5, 6, %1, %2, %3, %4 + lea dstq, [dstq+strideq*4] + WRITE_4X4 2, 3, 4, 5, 6, %1, %2, %3, %4 +%endmacro + +%macro INV_4X8 0 + punpckhwd m4, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + punpckhdq m1, m0, m2 ;low: in2 high: in3 + punpckldq m0, m2 ;low: in0 high: in1 + punpckldq m2, m3, m4 ;low: in4 high: in5 + punpckhdq m3, m4 ;low: in6 high: in7 +%endmacro + +%macro INV_TXFM_4X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x8, 8 +%ifidn %1_%2, dct_dct + pshuflw m0, [coeffq], q0000 + punpcklqdq m0, m0 + mova m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + mov [coeffq], eobd + pmulhrsw m0, m1 + pmulhrsw m0, m1 + pmulhrsw m0, [o(pw_2048)] + mova m1, m0 + mova m2, m0 + mova m3, m0 + TAIL_CALL m(iadst_4x8_internal_8bpc).end3 +%endif +%endmacro + +INIT_XMM ssse3 +INV_TXFM_4X8_FN dct, dct +INV_TXFM_4X8_FN dct, adst +INV_TXFM_4X8_FN dct, flipadst 
+INV_TXFM_4X8_FN dct, identity + +cglobal idct_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + +.pass1: + call m(idct_8x4_internal_8bpc).main + jmp m(iadst_4x8_internal_8bpc).pass1_end + +.pass2: + call .main + shufps m1, m1, q1032 + shufps m3, m3, q1032 + mova m4, [o(pw_2048)] + jmp m(iadst_4x8_internal_8bpc).end2 + +ALIGN function_align +cglobal_label .main + IDCT8_1D_PACKED + ret + + +INV_TXFM_4X8_FN adst, dct +INV_TXFM_4X8_FN adst, adst +INV_TXFM_4X8_FN adst, flipadst +INV_TXFM_4X8_FN adst, identity + +cglobal iadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + +.pass1: + call m(iadst_8x4_internal_8bpc).main + +.pass1_end: + INV_4X8 + jmp tx2q + +.pass2: + shufps m0, m0, q1032 + shufps m1, m1, q1032 + call .main + mova m4, [o(pw_2048)] + pxor m5, m5 + psubw m5, m4 + +.end: + punpcklqdq m4, m5 + +.end2: + pmulhrsw m0, m4 + pmulhrsw m1, m4 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + pxor m5, m5 + mova [coeffq+16*0], m5 + mova [coeffq+16*1], m5 + mova [coeffq+16*2], m5 + mova [coeffq+16*3], m5 + +.end3: + WRITE_4X8 0, 1, 2, 3 + RET + +ALIGN function_align +cglobal_label .main + mova m6, [o(pd_2048)] + punpckhwd m4, m3, m0 ;unpacked in7 in0 + punpckhwd m5, m2, m1 ;unpacked in5 in2 + punpcklwd m1, m2 ;unpacked in3 in4 + punpcklwd m0, m3 ;unpacked in1 in6 + ITX_MUL2X_PACK 4, 2, 6, 401, 4076 ;low: t0a high: t1a + ITX_MUL2X_PACK 5, 2, 6, 1931, 3612 ;low: t2a high: t3a + ITX_MUL2X_PACK 1, 2, 6, 3166, 2598 ;low: t4a high: t5a + ITX_MUL2X_PACK 0, 2, 6, 3920, 1189 ;low: t6a high: t7a + + psubsw m3, m4, m1 ;low: t4 high: t5 + paddsw m4, m1 ;low: t0 high: t1 + psubsw m2, m5, m0 ;low: t6 high: t7 + paddsw m5, m0 ;low: t2 high: t3 + + shufps m1, m3, m2, q1032 + punpckhwd m2, m1 + punpcklwd m3, m1 + ITX_MUL2X_PACK 3, 0, 6, 1567, 3784, 1 ;low: t5a high: t4a + ITX_MUL2X_PACK 2, 0, 6, 3784, 1567 ;low: t7a high: t6a + + psubsw m1, m4, m5 ;low: t2 high: t3 + paddsw m4, m5 ;low: out0 high: -out7 + psubsw m5, m3, m2 ;low: t7 high: t6 + paddsw m3, m2 ;low: out6 high: -out1 + shufps m0, m4, m3, q3210 ;low: out0 high: -out1 + shufps m3, m4, q3210 ;low: out6 high: -out7 + + mova m2, [o(pw_2896_m2896)] + mova m7, [o(pw_2896_2896)] + shufps m4, m1, m5, q1032 ;low: t3 high: t7 + shufps m1, m5, q3210 ;low: t2 high: t6 + punpcklwd m5, m1, m4 + punpckhwd m1, m4 + pmaddwd m4, m2, m1 ;-out5 + pmaddwd m2, m5 ; out4 + pmaddwd m1, m7 ; out2 + pmaddwd m5, m7 ;-out3 + REPX {paddd x, m6}, m4, m2, m1, m5 + REPX {psrad x, 12}, m4, m2, m1, m5 + packssdw m1, m5 ;low: out2 high: -out3 + packssdw m2, m4 ;low: out4 high: -out5 + ret + +INV_TXFM_4X8_FN flipadst, dct +INV_TXFM_4X8_FN flipadst, adst +INV_TXFM_4X8_FN flipadst, flipadst +INV_TXFM_4X8_FN flipadst, identity + +cglobal iflipadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + +.pass1: + call m(iadst_8x4_internal_8bpc).main + + punpcklwd m4, m3, m2 + punpckhwd m3, m2 + punpcklwd m5, m1, m0 + punpckhwd m1, m0 + punpckldq m2, m3, m1 ;low: in4 high: in5 + punpckhdq m3, m1 ;low: in6 high: in7 + punpckldq m0, m4, m5 ;low: in0 high: in1 + punpckhdq m1, m4, m5 ;low: in2 high: in3 + jmp tx2q + +.pass2: 
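+ ; pass2 reuses the 4x8 adst kernel; the outputs are then mirrored and the
+ ; rounding-constant signs swapped (m4 = -2048 below) to realize the flip.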
+ shufps m0, m0, q1032 + shufps m1, m1, q1032 + call m(iadst_4x8_internal_8bpc).main + + mova m4, m0 + mova m5, m1 + pshufd m0, m3, q1032 + pshufd m1, m2, q1032 + pshufd m2, m5, q1032 + pshufd m3, m4, q1032 + mova m5, [o(pw_2048)] + pxor m4, m4 + psubw m4, m5 + jmp m(iadst_4x8_internal_8bpc).end + +INV_TXFM_4X8_FN identity, dct +INV_TXFM_4X8_FN identity, adst +INV_TXFM_4X8_FN identity, flipadst +INV_TXFM_4X8_FN identity, identity + +cglobal iidentity_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + +.pass1: + mova m7, [o(pw_1697x8)] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + jmp m(iadst_4x8_internal_8bpc).pass1_end + +.pass2: + mova m4, [o(pw_4096)] + jmp m(iadst_4x8_internal_8bpc).end2 + + +%macro WRITE_8X2 5 ;coefs[1-2], tmp[1-3] + movq m%3, [dstq ] + movq m%4, [dstq+strideq] + pxor m%5, m%5 + punpcklbw m%3, m%5 ;extend byte to word + punpcklbw m%4, m%5 ;extend byte to word +%ifnum %1 + paddw m%3, m%1 +%else + paddw m%3, %1 +%endif +%ifnum %2 + paddw m%4, m%2 +%else + paddw m%4, %2 +%endif + packuswb m%3, m%4 + movq [dstq ], m%3 + punpckhqdq m%3, m%3 + movq [dstq+strideq], m%3 +%endmacro + +%macro WRITE_8X4 7 ;coefs[1-4], tmp[1-3] + WRITE_8X2 %1, %2, %5, %6, %7 + lea dstq, [dstq+strideq*2] + WRITE_8X2 %3, %4, %5, %6, %7 +%endmacro + +%macro INV_TXFM_8X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x4, 8 +%ifidn %1_%2, dct_dct + pshuflw m0, [coeffq], q0000 + punpcklqdq m0, m0 + mova m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + pmulhrsw m0, m1 + mova m2, [o(pw_2048)] + pmulhrsw m0, m1 + pmulhrsw m0, m2 + mova m1, m0 + mova m2, m0 + mova m3, m0 + TAIL_CALL m(iadst_8x4_internal_8bpc).end2 +%endif +%endmacro + +INV_TXFM_8X4_FN dct, dct +INV_TXFM_8X4_FN dct, adst +INV_TXFM_8X4_FN dct, flipadst +INV_TXFM_8X4_FN dct, identity + +cglobal idct_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + + call m(idct_4x8_internal_8bpc).main + + mova m4, [o(deint_shuf1)] + mova m5, [o(deint_shuf2)] + pshufb m0, m4 + pshufb m1, m5 + pshufb m2, m4 + pshufb m3, m5 + punpckhdq m4, m0, m1 + punpckldq m0, m1 + punpckhdq m5, m2, m3 + punpckldq m2, m3 + punpckhqdq m1, m0, m2 ;in1 + punpcklqdq m0, m2 ;in0 + punpckhqdq m3, m4, m5 ;in3 + punpcklqdq m2 ,m4, m5 ;in2 + jmp tx2q + +.pass2: + call .main + jmp m(iadst_8x4_internal_8bpc).end + +ALIGN function_align +cglobal_label .main + mova m6, [o(pd_2048)] + IDCT4_1D 0, 1, 2, 3, 4, 5, 6 + ret + +INV_TXFM_8X4_FN adst, dct +INV_TXFM_8X4_FN adst, adst +INV_TXFM_8X4_FN adst, flipadst +INV_TXFM_8X4_FN adst, identity + +cglobal iadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + + shufps m0, m0, q1032 + shufps m1, m1, q1032 + call m(iadst_4x8_internal_8bpc).main + + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + pxor m5, m5 + psubsw m3, m5, m1 + psubsw m5, m4 + punpckhdq m4, m5, m3 + punpckldq m5, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckhwd m1, m0, m5 ;in1 + punpcklwd m0, m5 ;in0 + punpcklwd m2, m3, m4 ;in2 + punpckhwd m3, m4 ;in3 + jmp tx2q + 
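+; pass2 applies the 4-point adst (.main below) across all eight packed
+; columns at once; .end then rounds the results via pmulhrsw with pw_2048.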
+.pass2: + call .main + +.end: + mova m4, [o(pw_2048)] + pmulhrsw m0, m4 + pmulhrsw m1, m4 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + +.end2: + pxor m6, m6 + mova [coeffq+16*0], m6 + mova [coeffq+16*1], m6 + mova [coeffq+16*2], m6 + mova [coeffq+16*3], m6 +.end3: + WRITE_8X4 0, 1, 2, 3, 4, 5, 6 + RET + +ALIGN function_align +cglobal_label .main + punpckhwd m6, m0, m2 ;unpacked in0 in2 + punpcklwd m0, m2 ;unpacked in0 in2 + punpckhwd m7, m1, m3 ;unpacked in1 in3 + punpcklwd m1, m3 ;unpacked in1 in3 + + mova m2, [o(pw_3344_m3344)] + mova m4, [o(pw_0_3344)] + pmaddwd m3, m2, m6 ;3344 * in0 - 3344 * in2 + pmaddwd m5, m4, m7 ;3344 * in3 + pmaddwd m2, m0 + pmaddwd m4, m1 + paddd m3, m5 + paddd m2, m4 + mova m4, [o(pd_2048)] + paddd m3, m4 ;t2 + 2048 + paddd m2, m4 + psrad m3, 12 + psrad m2, 12 + packssdw m2, m3 ;out2 + + pmaddwd m4, m0, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 + pmaddwd m0, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 + pmaddwd m3, m1, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 + pmaddwd m5, m1, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3 + paddd m3, m4 ;t0 + t3 + + pmaddwd m1, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 + mova m4, [o(pd_2048)] + paddd m0, m4 + paddd m4, m3 ;t0 + t3 + 2048 + paddd m5, m0 ;t1 + t3 + 2048 + paddd m3, m0 + paddd m3, m1 ;t0 + t1 - t3 + 2048 + + psrad m4, 12 ;out0 + psrad m5, 12 ;out1 + psrad m3, 12 ;out3 + packssdw m0, m4, m5 ;low: out0 high: out1 + + pmaddwd m4, m6, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 + pmaddwd m6, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 + pmaddwd m1, m7, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 + pmaddwd m5, m7, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3 + paddd m1, m4 ;t0 + t3 + pmaddwd m7, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 + + mova m4, [o(pd_2048)] + paddd m6, m4 + paddd m4, m1 ;t0 + t3 + 2048 + paddd m5, m6 ;t1 + t3 + 2048 + paddd m1, m6 + paddd m1, m7 ;t0 + t1 - t3 + 2048 + + psrad m4, 12 ;out0 + psrad m5, 12 ;out1 + psrad m1, 12 ;out3 + packssdw m3, m1 ;out3 + packssdw m4, m5 ;low: out0 high: out1 + + punpckhqdq m1, m0, m4 ;out1 + punpcklqdq m0, m4 ;out0 + ret + +INV_TXFM_8X4_FN flipadst, dct +INV_TXFM_8X4_FN flipadst, adst +INV_TXFM_8X4_FN flipadst, flipadst +INV_TXFM_8X4_FN flipadst, identity + +cglobal iflipadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + + shufps m0, m0, q1032 + shufps m1, m1, q1032 + call m(iadst_4x8_internal_8bpc).main + + punpckhwd m5, m3, m2 + punpcklwd m3, m2 + punpckhwd m2, m1, m0 + punpcklwd m1, m0 + + pxor m0, m0 + psubsw m4, m0, m2 + psubsw m0, m5 + punpckhdq m2, m0, m4 + punpckldq m0, m4 + punpckhdq m4, m3, m1 + punpckldq m3, m1 + punpckhwd m1, m0, m3 ;in1 + punpcklwd m0, m3 ;in0 + punpckhwd m3, m2, m4 ;in3 + punpcklwd m2, m4 ;in2 + jmp tx2q + +.pass2: + call m(iadst_8x4_internal_8bpc).main + mova m4, m0 + mova m5, m1 + mova m0, m3 + mova m1, m2 + mova m2, m5 + mova m3, m4 + jmp m(iadst_8x4_internal_8bpc).end + +INV_TXFM_8X4_FN identity, dct +INV_TXFM_8X4_FN identity, adst +INV_TXFM_8X4_FN identity, flipadst +INV_TXFM_8X4_FN identity, identity + +cglobal iidentity_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + paddsw m0, m0 + paddsw m1, m1 + paddsw m2, m2 + paddsw m3, m3 + + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, 
m3 + punpcklwd m2, m3 + punpckhdq m5, m4, m1 + punpckldq m4, m1 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckhwd m1, m0, m4 ;in1 + punpcklwd m0, m4 ;in0 + punpcklwd m2, m3, m5 ;in2 + punpckhwd m3, m5 ;in3 + jmp tx2q + +.pass2: + mova m7, [o(pw_1697x8)] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + jmp m(iadst_8x4_internal_8bpc).end + +%macro INV_TXFM_8X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x8, 8, 16*4 +%ifidn %1_%2, dct_dct + pshuflw m0, [coeffq], q0000 + punpcklwd m0, m0 + mova m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + mova m2, [o(pw_16384)] + mov [coeffq], eobd + pmulhrsw m0, m2 + psrlw m2, 3 + pmulhrsw m0, m1 + pmulhrsw m0, m2 +.end: + mov r3d, 2 + lea tx2q, [o(m(inv_txfm_add_dct_dct_8x8_8bpc).end3)] +.loop: + WRITE_8X4 0, 0, 0, 0, 1, 2, 3 + lea dstq, [dstq+strideq*2] + dec r3d + jg .loop + jmp tx2q +.end3: + RET +%endif +%endmacro + +%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2 +%if %3 + mova m7, [o(pw_2896x8)] + pmulhrsw m0, m7, [%1+%2*0] + pmulhrsw m1, m7, [%1+%2*1] + pmulhrsw m2, m7, [%1+%2*2] + pmulhrsw m3, m7, [%1+%2*3] + pmulhrsw m4, m7, [%1+%2*4] + pmulhrsw m5, m7, [%1+%2*5] + pmulhrsw m6, m7, [%1+%2*6] + pmulhrsw m7, [%1+%2*7] +%else + mova m0, [%1+%2*0] + mova m1, [%1+%2*1] + mova m2, [%1+%2*2] + mova m3, [%1+%2*3] + mova m4, [%1+%2*4] + mova m5, [%1+%2*5] + mova m6, [%1+%2*6] + mova m7, [%1+%2*7] +%endif +%endmacro + +%macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048 + ITX_MULSUB_2W %1, %4, %5, %6, %7, 799, 4017 ;t4a, t7a + ITX_MULSUB_2W %3, %2, %5, %6, %7, 3406, 2276, 1 ;t5a, t6a + psubsw m%2, m%4, m%5 ;t6a + paddsw m%4, m%5 ;t7 + psubsw m%5, m%1, m%3 ;t5a + paddsw m%1, m%3 ;t4 + ITX_MULSUB_2W %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5, t6 +%endmacro + +INV_TXFM_8X8_FN dct, dct +INV_TXFM_8X8_FN dct, adst +INV_TXFM_8X8_FN dct, flipadst +INV_TXFM_8X8_FN dct, identity + +cglobal idct_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq, 16 + +.pass1: + call .main + +.pass1_end: + mova m7, [o(pw_16384)] + +.pass1_end1: + REPX {pmulhrsw x, m7}, m0, m2, m4, m6 + mova [rsp+gprsize+16*1], m6 + +.pass1_end2: + REPX {pmulhrsw x, m7}, m1, m3, m5 + pmulhrsw m7, [rsp+gprsize+16*0] + +cglobal_label .pass1_end3 + punpcklwd m6, m1, m5 ;10 50 11 51 12 52 13 53 + punpckhwd m1, m5 ;14 54 15 55 16 56 17 57 + punpckhwd m5, m0, m4 ;04 44 05 45 06 46 07 47 + punpcklwd m0, m4 ;00 40 01 41 02 42 03 43 + punpckhwd m4, m3, m7 ;34 74 35 75 36 76 37 77 + punpcklwd m3, m7 ;30 70 31 71 32 72 33 73 + punpckhwd m7, m1, m4 ;16 36 56 76 17 37 57 77 + punpcklwd m1, m4 ;14 34 54 74 15 35 55 75 + punpckhwd m4, m6, m3 ;12 32 52 72 13 33 53 73 + punpcklwd m6, m3 ;10 30 50 70 11 31 51 71 + mova [rsp+gprsize+16*2], m6 + mova m6, [rsp+gprsize+16*1] + punpckhwd m3, m2, m6 ;24 64 25 65 26 66 27 67 + punpcklwd m2, m6 ;20 60 21 61 22 62 23 63 + punpckhwd m6, m5, m3 ;06 26 46 66 07 27 47 67 + punpcklwd m5, m3 ;04 24 44 64 05 25 45 65 + punpckhwd m3, m0, m2 ;02 22 42 62 03 23 43 63 + punpcklwd m0, m2 ;00 20 40 60 01 21 41 61 + + punpckhwd m2, m6, m7 ;07 17 27 37 47 57 67 77 + punpcklwd m6, m7 ;06 16 26 36 46 56 66 76 + mova [rsp+gprsize+16*0], m2 + punpcklwd m2, m3, m4 ;02 12 22 32 42 52 62 72 + punpckhwd m3, m4 ;03 13 23 33 43 53 63 73 + punpcklwd m4, m5, m1 ;04 14 24 34 44 54 64 74 + punpckhwd m5, m1 ;05 15 25 35 45 55 65 75 + mova m7, [rsp+gprsize+16*2] + punpckhwd m1, m0, m7 ;01 11 21 31 41 51 61 71 + punpcklwd m0, m7 ;00 10 20 30 40 50 60 70 + mova m7, [rsp+gprsize+16*0] + jmp 
tx2q + +.pass2: + lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] + +.pass2_main: + call .main + +.end: + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m2, m4, m6 + mova [rsp+gprsize+16*1], m6 + +.end2: + REPX {pmulhrsw x, m7}, m1, m3, m5 + pmulhrsw m7, [rsp+gprsize+16*0] + mova [rsp+gprsize+16*2], m5 + mova [rsp+gprsize+16*0], m7 + +.end3: + WRITE_8X4 0, 1, 2, 3, 5, 6, 7 + lea dstq, [dstq+strideq*2] + WRITE_8X4 4, [rsp+gprsize+16*2], [rsp+gprsize+16*1], [rsp+gprsize+16*0], 5, 6, 7 + jmp tx2q + +.end4: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + ret + +ALIGN function_align +cglobal_label .main + mova [rsp+gprsize*2+16*0], m7 + mova [rsp+gprsize*2+16*1], m3 + mova [rsp+gprsize*2+16*2], m1 + mova m7, [o(pd_2048)] + IDCT4_1D 0, 2, 4, 6, 1, 3, 7 + mova m3, [rsp+gprsize*2+16*2] + mova [rsp+gprsize*2+16*2], m2 + mova m2, [rsp+gprsize*2+16*1] + mova [rsp+gprsize*2+16*1], m4 + mova m4, [rsp+gprsize*2+16*0] + mova [rsp+gprsize*2+16*0], m6 + IDCT8_1D_ODDHALF 3, 2, 5, 4, 1, 6, 7 + mova m6, [rsp+gprsize*2+16*0] + psubsw m7, m0, m4 ;out7 + paddsw m0, m4 ;out0 + mova [rsp+gprsize*2+16*0], m7 + mova m1, [rsp+gprsize*2+16*2] + psubsw m4, m6, m3 ;out4 + paddsw m3, m6 ;out3 + mova m7, [rsp+gprsize*2+16*1] + psubsw m6, m1, m5 ;out6 + paddsw m1, m5 ;out1 + psubsw m5, m7, m2 ;out5 + paddsw m2, m7 ;out2 + ret + + +INV_TXFM_8X8_FN adst, dct +INV_TXFM_8X8_FN adst, adst +INV_TXFM_8X8_FN adst, flipadst +INV_TXFM_8X8_FN adst, identity + +cglobal iadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq, 16 + +.pass1: + call .main + call .main_pass1_end + +.pass1_end: + mova m7, [o(pw_16384)] + +.pass1_end1: + REPX {pmulhrsw x, m7}, m0, m2, m4, m6 + mova [rsp+gprsize+16*1], m6 + pxor m6, m6 + psubw m6, m7 + mova m7, m6 + jmp m(idct_8x8_internal_8bpc).pass1_end2 + +ALIGN function_align +.pass2: + lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] + +.pass2_main: + call .main + call .main_pass2_end + +.end: + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m2, m4, m6 + mova [rsp+gprsize+16*1], m6 + pxor m6, m6 + psubw m6, m7 + mova m7, m6 + jmp m(idct_8x8_internal_8bpc).end2 + +ALIGN function_align +cglobal_label .main + mova [rsp+gprsize*2+16*0], m7 + mova [rsp+gprsize*2+16*1], m3 + mova [rsp+gprsize*2+16*2], m4 + mova m7, [o(pd_2048)] + ITX_MULSUB_2W 5, 2, 3, 4, 7, 1931, 3612 ;t3a, t2a + ITX_MULSUB_2W 1, 6, 3, 4, 7, 3920, 1189 ;t7a, t6a + paddsw m3, m2, m6 ;t2 + psubsw m2, m6 ;t6 + paddsw m4, m5, m1 ;t3 + psubsw m5, m1 ;t7 + ITX_MULSUB_2W 5, 2, 1, 6, 7, 3784, 1567 ;t6a, t7a + + mova m6, [rsp+gprsize*2+16*2] + mova [rsp+gprsize*2+16*2], m5 + mova m1, [rsp+gprsize*2+16*1] + mova [rsp+gprsize*2+16*1], m2 + mova m5, [rsp+gprsize*2+16*0] + mova [rsp+gprsize*2+16*0], m3 + ITX_MULSUB_2W 5, 0, 2, 3, 7, 401, 4076 ;t1a, t0a + ITX_MULSUB_2W 1, 6, 2, 3, 7, 3166, 2598 ;t5a, t4a + psubsw m2, m0, m6 ;t4 + paddsw m0, m6 ;t0 + paddsw m3, m5, m1 ;t1 + psubsw m5, m1 ;t5 + ITX_MULSUB_2W 2, 5, 1, 6, 7, 1567, 3784 ;t5a, t4a + + mova m7, [rsp+gprsize*2+16*0] + paddsw m1, m3, m4 ;-out7 + psubsw m3, m4 ;t3 + mova [rsp+gprsize*2+16*0], m1 + psubsw m4, m0, m7 ;t2 + paddsw m0, m7 ;out0 + mova m6, [rsp+gprsize*2+16*2] + mova m7, [rsp+gprsize*2+16*1] + paddsw m1, m5, m6 ;-out1 + psubsw m5, m6 ;t6 + paddsw m6, m2, m7 ;out6 + psubsw m2, m7 ;t7 + ret +ALIGN function_align +.main_pass1_end: + mova [rsp+gprsize*2+16*1], m1 + mova [rsp+gprsize*2+16*2], m6 + punpckhwd m1, m4, m3 + punpcklwd m4, m3 + punpckhwd m7, m5, m2 + punpcklwd m5, m2 + mova m2, [o(pw_2896_2896)] + mova m6, [o(pd_2048)] + pmaddwd 
m3, m2, m7 + pmaddwd m2, m5 + paddd m3, m6 + paddd m2, m6 + psrad m3, 12 + psrad m2, 12 + packssdw m2, m3 ;out2 + mova m3, [o(pw_2896_m2896)] + pmaddwd m7, m3 + pmaddwd m5, m3 + paddd m7, m6 + paddd m5, m6 + psrad m7, 12 + psrad m5, 12 + packssdw m5, m7 ;-out5 + mova m3, [o(pw_2896_2896)] + pmaddwd m7, m3, m1 + pmaddwd m3, m4 + paddd m7, m6 + paddd m3, m6 + psrad m7, 12 + psrad m3, 12 + packssdw m3, m7 ;-out3 + mova m7, [o(pw_2896_m2896)] + pmaddwd m1, m7 + pmaddwd m4, m7 + paddd m1, m6 + paddd m4, m6 + psrad m1, 12 + psrad m4, 12 + packssdw m4, m1 ;out4 + mova m1, [rsp+gprsize*2+16*1] + mova m6, [rsp+gprsize*2+16*2] + ret +ALIGN function_align +cglobal_label .main_pass2_end + paddsw m7, m4, m3 ;t2 + t3 + psubsw m4, m3 ;t2 - t3 + paddsw m3, m5, m2 ;t6 + t7 + psubsw m5, m2 ;t6 - t7 + mova m2, [o(pw_2896x8)] + pmulhrsw m4, m2 ;out4 + pmulhrsw m5, m2 ;-out5 + pmulhrsw m7, m2 ;-out3 + pmulhrsw m2, m3 ;out2 + mova m3, m7 + ret + +INV_TXFM_8X8_FN flipadst, dct +INV_TXFM_8X8_FN flipadst, adst +INV_TXFM_8X8_FN flipadst, flipadst +INV_TXFM_8X8_FN flipadst, identity + +cglobal iflipadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq, 16 + +.pass1: + call m(iadst_8x8_internal_8bpc).main + call m(iadst_8x8_internal_8bpc).main_pass1_end + +.pass1_end: + mova m7, [o(pw_m16384)] + +.pass1_end1: + pmulhrsw m1, m7 + mova [rsp+gprsize+16*1], m1 + mova m1, m6 + mova m6, m2 + pmulhrsw m2, m5, m7 + mova m5, m6 + mova m6, m4 + pmulhrsw m4, m3, m7 + mova m3, m6 + mova m6, m0 + mova m0, m7 + pxor m7, m7 + psubw m7, m0 + pmulhrsw m0, [rsp+gprsize+16*0] + REPX {pmulhrsw x, m7}, m1, m3, m5 + pmulhrsw m7, m6 + jmp m(idct_8x8_internal_8bpc).pass1_end3 + +ALIGN function_align +.pass2: + lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] + +.pass2_main: + call m(iadst_8x8_internal_8bpc).main + call m(iadst_8x8_internal_8bpc).main_pass2_end + +.end: + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m2, m4, m6 + mova [rsp+gprsize+16*2], m2 + mova m2, m0 + pxor m0, m0 + psubw m0, m7 + mova m7, m2 + pmulhrsw m1, m0 + pmulhrsw m2, m5, m0 + mova [rsp+gprsize+16*1], m1 + mova m5, m4 + mova m1, m6 + pmulhrsw m4, m3, m0 + pmulhrsw m0, [rsp+gprsize+16*0] + mova m3, m5 + mova [rsp+gprsize+16*0], m7 + jmp m(idct_8x8_internal_8bpc).end3 + +INV_TXFM_8X8_FN identity, dct +INV_TXFM_8X8_FN identity, adst +INV_TXFM_8X8_FN identity, flipadst +INV_TXFM_8X8_FN identity, identity + +cglobal iidentity_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq, 16 + mova [rsp+gprsize+16*1], m6 + jmp m(idct_8x8_internal_8bpc).pass1_end3 + +ALIGN function_align +.pass2: + lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] + +.end: + pmulhrsw m7, [o(pw_4096)] + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_4096)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + mova [rsp+gprsize+16*2], m5 + mova [rsp+gprsize+16*1], m6 + jmp m(idct_8x8_internal_8bpc).end3 + + +%macro INV_TXFM_4X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x16, 8 +%ifidn %1_%2, dct_dct + pshuflw m0, [coeffq], q0000 + punpcklwd m0, m0 + mova m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + mov [coeffq], eobd + pmulhrsw m0, [o(pw_16384)] + pmulhrsw m0, m1 + pmulhrsw m0, [o(pw_2048)] +.end: + WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 + lea dstq, [dstq+strideq*4] + WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 + lea dstq, [dstq+strideq*4] + WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 + lea dstq, [dstq+strideq*4] + WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 + RET +%endif +%endmacro + +INV_TXFM_4X16_FN dct, dct +INV_TXFM_4X16_FN dct, adst +INV_TXFM_4X16_FN dct, flipadst
+INV_TXFM_4X16_FN dct, identity + +cglobal idct_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(idct_4x8_internal_8bpc).pass1)] + +.pass1: + mova m0, [coeffq+16*1] + mova m1, [coeffq+16*3] + mova m2, [coeffq+16*5] + mova m3, [coeffq+16*7] + push tx2q + lea tx2q, [o(m(idct_4x16_internal_8bpc).pass1_2)] + jmp r3 + +.pass1_2: + mova [coeffq+16*1], m0 + mova [coeffq+16*3], m1 + mova [coeffq+16*5], m2 + mova [coeffq+16*7], m3 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*2] + mova m2, [coeffq+16*4] + mova m3, [coeffq+16*6] + lea tx2q, [o(m(idct_4x16_internal_8bpc).pass1_end)] + jmp r3 + +.pass1_end: + pop tx2q + + mova m4, [coeffq+16*1] + mova m5, [coeffq+16*3] + mova m6, [coeffq+16*5] + mova m7, [o(pw_16384)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + + pmulhrsw m7, [coeffq+16*7] + mova [coeffq+16*7], m7 + jmp tx2q + +.pass2: + call m(idct_16x4_internal_8bpc).main + +.end: + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [coeffq+16*7] + mova [coeffq+16*4], m4 + +.end1: + mova [coeffq+16*5], m5 + mova [coeffq+16*6], m6 + mov r3, coeffq + WRITE_4X8 0, 1, 3, 2 + + mova m0, [r3+16*4] + mova m1, [r3+16*5] + mova m2, [r3+16*6] + mova m3, m7 + lea dstq, [dstq+strideq*4] + WRITE_4X8 0, 1, 3, 2 + +.end2: + pxor m7, m7 + REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + ret + +INV_TXFM_4X16_FN adst, dct +INV_TXFM_4X16_FN adst, adst +INV_TXFM_4X16_FN adst, flipadst +INV_TXFM_4X16_FN adst, identity + +cglobal iadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(iadst_4x8_internal_8bpc).pass1)] + jmp m(idct_4x16_internal_8bpc).pass1 + +.pass2: + call m(iadst_16x4_internal_8bpc).main + call m(iadst_16x4_internal_8bpc).main_pass2_end + + punpcklqdq m6, m5, m4 ;low: -out5 high: -out7 + punpckhqdq m4, m5 ;low: out8 high: out10 + punpcklqdq m5, m7, m2 ;low: out4 high: out6 + punpckhqdq m2, m7 ;low: -out9 high: -out11 + mova [coeffq+16*4], m2 + mova [coeffq+16*5], m6 + mova m2, [coeffq+16*6] + mova m6, [coeffq+16*7] + punpckhqdq m1, m6, m0 ;low: -out13 high: -out15 + punpcklqdq m0, m6 ;low: out0 high: out2 + punpckhqdq m6, m3, m2 ;low: out12 high: out14 + punpcklqdq m2, m3 ;low: -out1 high: -out3 + + mova m7, [o(pw_2048)] + +.end1: + REPX {pmulhrsw x, m7}, m0, m5, m4, m6 + pxor m3, m3 + psubw m3, m7 + mova m7, [coeffq+16*4] + REPX {pmulhrsw x, m3}, m2, m7, m1 + pmulhrsw m3, [coeffq+16*5] + mova [coeffq+16*7], m5 + + punpckhqdq m5, m4, m7 ;low: out10 high: out11 + punpcklqdq m4, m7 ;low: out8 high: out9 + punpckhqdq m7, m6, m1 ;low: out14 high: out15 + punpcklqdq m6, m1 ;low: out12 high: out13 + punpckhqdq m1, m0, m2 ;low: out2 high: out3 + punpcklqdq m0, m2 ;low: out0 high: out1 + mova [coeffq+16*4], m4 + mova m4, [coeffq+16*7] + punpcklqdq m2, m4, m3 ;low: out4 high: out5 + punpckhqdq m4, m3 ;low: out6 high: out7 + mova m3, m4 + +.end2: + mova [coeffq+16*5], m5 + mova [coeffq+16*6], m6 + mov r3, coeffq + WRITE_4X8 0, 1, 2, 3 + + mova m0, [r3+16*4] + mova m1, [r3+16*5] + mova m2, [r3+16*6] + mova m3, m7 + lea dstq, [dstq+strideq*4] + WRITE_4X8 0, 1, 2, 3 + +.end3: + pxor m7, m7 + REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + ret + + +INV_TXFM_4X16_FN flipadst, dct +INV_TXFM_4X16_FN flipadst, adst +INV_TXFM_4X16_FN flipadst, flipadst +INV_TXFM_4X16_FN flipadst, identity + +cglobal iflipadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(iflipadst_4x8_internal_8bpc).pass1)] + jmp m(idct_4x16_internal_8bpc).pass1 + +.pass2: + call m(iadst_16x4_internal_8bpc).main + call 
m(iadst_16x4_internal_8bpc).main_pass2_end + + punpckhqdq m6, m5, m4 ;low: out5 high: out7 + punpcklqdq m4, m5 ;low: -out8 high: -out10 + punpckhqdq m5, m7, m2 ;low: -out4 high: -out6 + punpcklqdq m2, m7 ;low: out9 high: out11 + mova [coeffq+16*4], m2 + mova [coeffq+16*5], m6 + mova m2, [coeffq+16*6] + mova m6, [coeffq+16*7] + punpcklqdq m1, m6, m0 ;low: out13 high: out15 + punpckhqdq m0, m6 ;low: -out0 high: -out2 + punpcklqdq m6, m3, m2 ;low: -out12 high: -out14 + punpckhqdq m2, m3 ;low: out1 high: out3 + + mova m7, [o(pw_m2048)] + jmp m(iadst_4x16_internal_8bpc).end1 + + +INV_TXFM_4X16_FN identity, dct +INV_TXFM_4X16_FN identity, adst +INV_TXFM_4X16_FN identity, flipadst +INV_TXFM_4X16_FN identity, identity + +%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384] + pmulhrsw m%2, m%3, m%1 +%if %0 == 4 ; if downshifting by 1 + pmulhrsw m%2, m%4 +%else + paddsw m%1, m%1 +%endif + paddsw m%1, m%2 +%endmacro + +cglobal iidentity_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m0, [coeffq+16*1] + mova m6, [o(pw_1697x8)] + mova m1, [coeffq+16*3] + mova m2, [coeffq+16*5] + mova m3, [coeffq+16*7] + pcmpeqw m7, m7 + mov r3, tx2q + lea tx2q, [o(.pass1_2)] +.pass1: + pmulhrsw m4, m6, m0 + pmulhrsw m5, m6, m1 + pavgw m4, m0 + pcmpeqw m0, m7 + pavgw m5, m1 + pcmpeqw m1, m7 + pandn m0, m4 + pmulhrsw m4, m6, m2 + pandn m1, m5 + pmulhrsw m5, m6, m3 + pavgw m4, m2 + pcmpeqw m2, m7 + pavgw m5, m3 + pcmpeqw m3, m7 + pandn m2, m4 + pandn m3, m5 + jmp m(iadst_4x8_internal_8bpc).pass1_end +.pass1_2: + mova [coeffq+16*1], m0 + mova [coeffq+16*3], m1 + mova [coeffq+16*5], m2 + mova [coeffq+16*7], m3 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*2] + mova m2, [coeffq+16*4] + mova m3, [coeffq+16*6] + lea tx2q, [o(.pass1_end)] + jmp .pass1 +.pass1_end: + mova m4, [coeffq+16*1] + mova m5, [coeffq+16*3] + mova m6, [coeffq+16*5] + jmp r3 +.pass2: + mova m7, [o(pw_1697x16)] + mova [coeffq+16*6], m6 + REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5 + mova m6, [coeffq+16*7] + IDTX16 6, 7, 7 + mova [coeffq+16*7], m6 + mova m6, [coeffq+16*6] + pmulhrsw m7, m6, [o(pw_1697x16)] + paddsw m6, m6 + paddsw m6, m7 + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [coeffq+16*7] + mova [coeffq+16*4], m4 + jmp m(iadst_4x16_internal_8bpc).end2 + + +%macro INV_TXFM_16X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x4, 8 +%ifidn %1_%2, dct_dct + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_16384)] + mov [coeffq], eobd + mov r2d, 2 + lea tx2q, [o(m(inv_txfm_add_dct_dct_16x4_8bpc).end)] +.dconly: + pmulhrsw m0, m2 + movd m2, [o(pw_2048)] ;intentionally rip-relative + pmulhrsw m0, m1 + pmulhrsw m0, m2 + pshuflw m0, m0, q0000 + punpcklwd m0, m0 + pxor m5, m5 +.dconly_loop: + mova m1, [dstq] + mova m3, [dstq+strideq] + punpckhbw m2, m1, m5 + punpcklbw m1, m5 + punpckhbw m4, m3, m5 + punpcklbw m3, m5 + paddw m2, m0 + paddw m1, m0 + paddw m4, m0 + paddw m3, m0 + packuswb m1, m2 + packuswb m3, m4 + mova [dstq], m1 + mova [dstq+strideq], m3 + lea dstq, [dstq+strideq*2] + dec r2d + jg .dconly_loop + jmp tx2q +.end: + RET +%endif +%endmacro + +%macro LOAD_7ROWS 2 ;src, stride + mova m0, [%1+%2*0] + mova m1, [%1+%2*1] + mova m2, [%1+%2*2] + mova m3, [%1+%2*3] + mova m4, [%1+%2*4] + mova m5, [%1+%2*5] + mova m6, [%1+%2*6] +%endmacro + +%macro SAVE_7ROWS 2 ;src, stride + mova [%1+%2*0], m0 + mova [%1+%2*1], m1 + mova [%1+%2*2], m2 + mova [%1+%2*3], m3 + mova [%1+%2*4], m4 + mova [%1+%2*5], m5 + mova [%1+%2*6], m6 +%endmacro + +%macro IDCT16_1D_PACKED_ODDHALF 7 ;src[1-4],
tmp[1-3] + punpckhwd m%5, m%4, m%1 ;packed in13 in3 + punpcklwd m%1, m%4 ;packed in1 in15 + punpcklwd m%4, m%3, m%2 ;packed in9 in7 + punpckhwd m%2, m%3 ;packed in5 in11 + mova m%7, [o(pd_2048)] + ITX_MUL2X_PACK %1, %6, %7, 401, 4076, 1 ;low: t8a high: t15a + ITX_MUL2X_PACK %4, %6, %7, 3166, 2598, 1 ;low: t9a high: t14a + ITX_MUL2X_PACK %2, %6, %7, 1931, 3612, 1 ;low: t10a high: t13a + ITX_MUL2X_PACK %5, %6, %7, 3920, 1189, 1 ;low: t11a high: t12a + psubsw m%6, m%1, m%4 ;low: t9 high: t14 + paddsw m%1, m%4 ;low: t8 high: t15 + psubsw m%4, m%5, m%2 ;low: t10 high: t13 + paddsw m%5, m%2 ;low: t11 high: t12 + mova m%2, [o(deint_shuf2)] + pshufb m%6, m%2 + pshufb m%4, m%2 + ITX_MUL2X_PACK %6, %3, %7, 1567, 3784, 1 ;low: t9a high: t14a + ITX_MUL2X_PACK %4, %3, %7, m3784, 1567, 1 ;low: t10a high: t13a + psubsw m%3, m%1, m%5 ;low: t11a high: t12a + paddsw m%1, m%5 ;low: t8a high: t15a + psubsw m%5, m%6, m%4 ;low: t10 high: t13 + paddsw m%6, m%4 ;low: t9 high: t14 + pshufb m%3, m%2 + pshufb m%5, m%2 + ITX_MUL2X_PACK %3, %2, %7, 2896, 2896, 4 ;t12, t11 + ITX_MUL2X_PACK %5, %4, %7, 2896, 2896, 4 ;t13a, t10a + packssdw m%2, m%4 ;low: t11 high: t10a + packssdw m%3, m%5 ;low: t12 high: t13a + punpckhqdq m%4, m%1, m%6 ;low: t15a high: t14 + punpcklqdq m%1, m%6 ;low: t8a high: t9 +%endmacro + +INV_TXFM_16X4_FN dct, dct +INV_TXFM_16X4_FN dct, adst +INV_TXFM_16X4_FN dct, flipadst +INV_TXFM_16X4_FN dct, identity + +cglobal idct_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_7ROWS coeffq, 16 + call .main + +.pass1_end: + punpckhwd m7, m0, m2 ;packed out1, out5 + punpcklwd m0, m2 ;packed out0, out4 + punpcklwd m2, m1, m3 ;packed out3, out7 + punpckhwd m1, m3 ;packed out2, out6 + mova [coeffq+16*6], m7 + mova m7, [coeffq+16*7] + punpckhwd m3, m4, m6 ;packed out9, out13 + punpcklwd m4, m6 ;packed out8, out12 + punpcklwd m6, m5, m7 ;packed out11, out15 + punpckhwd m5, m7 ;packed out10, out14 + +.pass1_end2: + mova m7, [o(pw_16384)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [coeffq+16*6] + mova [coeffq+16*6], m7 + +.pass1_end3: + punpckhwd m7, m3, m6 ;packed 9, 11, 13, 15 high + punpcklwd m3, m6 ;packed 9, 11, 13, 15 low + punpckhwd m6, m4, m5 ;packed 8, 10, 12, 14 high + punpcklwd m4, m5 ;packed 8, 10, 12, 14 low + punpckhwd m5, m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(1) + punpcklwd m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(0) + punpckhwd m3, m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(3) + punpcklwd m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(2) + mova [coeffq+16*7], m3 + mova m3, [coeffq+16*6] + punpckhwd m7, m3, m2 ;packed 1, 3, 5, 7 high + punpcklwd m3, m2 ;packed 1, 3, 5, 7 low + punpckhwd m2, m0, m1 ;packed 0, 2, 4, 6 high + punpcklwd m0, m1 ;packed 0, 2, 4, 6 low + punpckhwd m1, m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(1) + punpcklwd m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(0) + punpckhwd m3, m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(3) + punpcklwd m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(2) + jmp tx2q + +.pass2: + lea tx2q, [o(m(idct_8x4_internal_8bpc).pass2)] + +.pass2_end: + mova [coeffq+16*4], m4 + mova [coeffq+16*5], m5 + mova [coeffq+16*6], m6 + lea r3, [dstq+8] + call tx2q + + add coeffq, 16*4 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + mova m2, [coeffq+16*2] + mova m3, [coeffq+16*3] + mov dstq, r3 + jmp tx2q + +ALIGN function_align +cglobal_label .main + punpckhqdq m7, m0, m1 ;low:in1 high:in3 + punpcklqdq m0, m1 + punpcklqdq m1, m2, m3 + punpckhqdq m3, m2 ;low:in7 high:in5 + mova [coeffq+16*4], m7 + mova [coeffq+16*5], m3 + mova m7, [coeffq+16*7] + punpcklqdq m2, m4, m5 + punpckhqdq m4, m5 ;low:in9 high:in11
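+ ; in8|in10 and in9|in11 are now qword-packed; the last two input pairs
+ ; are packed the same way below, so the 16-point 1-D IDCT runs entirely
+ ; on packed register halves (spilled pairs live in coeffq scratch)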
+ punpcklqdq m3, m6, m7 + punpckhqdq m7, m6 ;low:in15 high:in13 + mova [coeffq+16*6], m4 + IDCT8_1D_PACKED + mova m6, [coeffq+16*4] + mova m4, [coeffq+16*5] + mova m5, [coeffq+16*6] + mova [coeffq+16*4], m1 + mova [coeffq+16*5], m2 + mova [coeffq+16*6], m3 + + IDCT16_1D_PACKED_ODDHALF 6, 4, 5, 7, 1, 2, 3 + + mova m1, [coeffq+16*4] + psubsw m3, m0, m7 ;low:out15 high:out14 + paddsw m0, m7 ;low:out0 high:out1 + psubsw m7, m1, m5 ;low:out12 high:out13 + paddsw m1, m5 ;low:out3 high:out2 + mova [coeffq+16*7], m3 + mova m2, [coeffq+16*5] + mova m3, [coeffq+16*6] + psubsw m5, m2, m4 ;low:out11 high:out10 + paddsw m2, m4 ;low:out4 high:out5 + psubsw m4, m3, m6 ;low:out8 high:out9 + paddsw m3, m6 ;low:out7 high:out6 + mova m6, m7 + ret + +INV_TXFM_16X4_FN adst, dct +INV_TXFM_16X4_FN adst, adst +INV_TXFM_16X4_FN adst, flipadst +INV_TXFM_16X4_FN adst, identity + +cglobal iadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_7ROWS coeffq, 16 + call .main + call .main_pass1_end + + punpckhwd m6, m7, m0 ;packed -out11, -out15 + punpcklwd m0, m7 ;packed out0, out4 + punpcklwd m7, m3, m4 ;packed -out3, -out7 + punpckhwd m4, m3 ;packed out8, out12 + mova m1, [coeffq+16*6] + punpcklwd m3, m1, m5 ;packed -out1, -out5 + punpckhwd m5, m1 ;packed out10, out14 + mova m1, [coeffq+16*7] + mova [coeffq+16*6], m3 + mova [coeffq+16*7], m7 + punpckhwd m3, m2, m1 ;packed -out9, -out13 + punpcklwd m1, m2 ;packed out2, out6 + + mova m7, [o(pw_16384)] + +.pass1_end: + REPX {pmulhrsw x, m7}, m0, m1, m4, m5 + pxor m2, m2 + psubw m2, m7 + mova m7, [coeffq+16*6] + REPX {pmulhrsw x, m2}, m7, m3, m6 + pmulhrsw m2, [coeffq+16*7] + mova [coeffq+16*6], m7 + jmp m(idct_16x4_internal_8bpc).pass1_end3 + +.pass2: + lea tx2q, [o(m(iadst_8x4_internal_8bpc).pass2)] + jmp m(idct_16x4_internal_8bpc).pass2_end + +ALIGN function_align +cglobal_label .main + mova [coeffq+16*6], m0 + pshufd m0, m1, q1032 + pshufd m2, m2, q1032 + punpckhwd m1, m6, m0 ;packed in13, in2 + punpcklwd m0, m6 ;packed in3, in12 + punpckhwd m7, m5, m2 ;packed in11, in4 + punpcklwd m2, m5 ;packed in5, in10 + mova m6, [o(pd_2048)] + ITX_MUL2X_PACK 1, 5, 6, 995, 3973 ;low:t2 high:t3 + ITX_MUL2X_PACK 7, 5, 6, 1751, 3703 ;low:t4 high:t5 + ITX_MUL2X_PACK 2, 5, 6, 3513, 2106 ;low:t10 high:t11 + ITX_MUL2X_PACK 0, 5, 6, 3857, 1380 ;low:t12 high:t13 + psubsw m5, m1, m2 ;low:t10a high:t11a + paddsw m1, m2 ;low:t2a high:t3a + psubsw m2, m7, m0 ;low:t12a high:t13a + paddsw m7, m0 ;low:t4a high:t5a + punpcklqdq m0, m5 + punpckhwd m0, m5 ;packed t10a, t11a + punpcklqdq m5, m2 + punpckhwd m2, m5 ;packed t13a, t12a + ITX_MUL2X_PACK 0, 5, 6, 3406, 2276 ;low:t10 high:t11 + ITX_MUL2X_PACK 2, 5, 6, 4017, 799, 1 ;low:t12 high:t13 + mova [coeffq+16*4], m1 + mova [coeffq+16*5], m7 + mova m1, [coeffq+16*6] + mova m7, [coeffq+16*7] + pshufd m1, m1, q1032 + pshufd m3, m3, q1032 + punpckhwd m5, m7, m1 ;packed in15, in0 + punpcklwd m1, m7 ;packed in1, in14 + punpckhwd m7, m4, m3 ;packed in9, in6 + punpcklwd m3, m4 ;packed in7, in8 + ITX_MUL2X_PACK 5, 4, 6, 201, 4091 ;low:t0 high:t1 + ITX_MUL2X_PACK 7, 4, 6, 2440, 3290 ;low:t6 high:t7 + ITX_MUL2X_PACK 3, 4, 6, 3035, 2751 ;low:t8 high:t9 + ITX_MUL2X_PACK 1, 4, 6, 4052, 601 ;low:t14 high:t15 + psubsw m4, m5, m3 ;low:t8a high:t9a + paddsw m5, m3 ;low:t0a high:t1a + psubsw m3, m7, m1 ;low:t14a high:t15a + paddsw m7, m1 ;low:t6a high:t7a + punpcklqdq m1, m4 + punpckhwd m1, m4 ;packed t8a, t9a + punpcklqdq m4, m3 + punpckhwd m3, m4 ;packed t15a, t14a + ITX_MUL2X_PACK 1, 4, 6, 799, 4017 ;low:t8 high:t9 + ITX_MUL2X_PACK 3, 4, 6, 2276, 
3406, 1 ;low:t14 high:t15 + paddsw m4, m1, m2 ;low:t12a high:t13a + psubsw m1, m2 ;low:t8a high:t9a + psubsw m2, m0, m3 ;low:t14a high:t15a + paddsw m0, m3 ;low:t10a high:t11a + punpcklqdq m3, m1 + punpckhwd m3, m1 ;packed t12a, t13a + punpcklqdq m1, m2 + punpckhwd m2, m1 ;packed t15a, t14a + ITX_MUL2X_PACK 3, 1, 6, 1567, 3784 ;low:t12 high:t13 + ITX_MUL2X_PACK 2, 1, 6, 3784, 1567, 1 ;low:t14 high:t15 + psubsw m1, m3, m2 ;low:t14a high:t15a + paddsw m3, m2 ;low:out2 high:-out13 + psubsw m2, m4, m0 ;low:t10 high:t11 + paddsw m0, m4 ;low:-out1 high:out14 + mova [coeffq+16*6], m0 + mova [coeffq+16*7], m3 + mova m0, [coeffq+16*4] + mova m3, [coeffq+16*5] + psubsw m4, m5, m3 ;low:t4 high:t5 + paddsw m5, m3 ;low:t0 high:t1 + psubsw m3, m0, m7 ;low:t6 high:t7 + paddsw m0, m7 ;low:t2 high:t3 + punpcklqdq m7, m4 + punpckhwd m7, m4 ;packed t4, t5 + punpcklqdq m4, m3 + punpckhwd m3, m4 ;packed t7, t6 + ITX_MUL2X_PACK 7, 4, 6, 1567, 3784 ;low:t4a high:t5a + ITX_MUL2X_PACK 3, 4, 6, 3784, 1567, 1 ;low:t6a high:t7a + psubsw m4, m5, m0 ;low:t2a high:t3a + paddsw m0, m5 ;low:out0 high:-out15 + psubsw m5, m7, m3 ;low:t6 high:t7 + paddsw m3, m7 ;low:-out3 high:out12 + ret +ALIGN function_align +.main_pass1_end: + mova m7, [o(deint_shuf1)] + mova [coeffq+16*4], m0 + mova [coeffq+16*5], m3 + mova m0, [o(pw_2896_m2896)] + mova m3, [o(pw_2896_2896)] + pshufb m1, m7 ;t14a t15a + pshufb m2, m7 ;t10 t11 + pshufb m4, m7 ;t2a t3a + pshufb m5, m7 ;t6 t7 + pmaddwd m7, m0, m2 + pmaddwd m2, m3 + paddd m7, m6 + paddd m2, m6 + psrad m7, 12 + psrad m2, 12 + packssdw m2, m7 ;low:out6 high:-out9 + pmaddwd m7, m0, m4 + pmaddwd m4, m3 + paddd m7, m6 + paddd m4, m6 + psrad m7, 12 + psrad m4, 12 + packssdw m4, m7 ;low:-out7 high:out8 + pmaddwd m7, m3, m5 + pmaddwd m5, m0 + paddd m7, m6 + paddd m5, m6 + psrad m7, 12 + psrad m5, 12 + packssdw m7, m5 ;low:out4 high:-out11 + pmaddwd m5, m3, m1 + pmaddwd m1, m0 + paddd m5, m6 + paddd m1, m6 + psrad m5, 12 + psrad m1, 12 + packssdw m5, m1 ;low:-out5 high:out10 + mova m0, [coeffq+16*4] + mova m3, [coeffq+16*5] + ret +ALIGN function_align +cglobal_label .main_pass2_end + mova m7, [o(pw_2896x8)] + punpckhqdq m6, m2, m1 ;low:t11 high:t15a + punpcklqdq m2, m1 ;low:t10 high:t14a + psubsw m1, m2, m6 + paddsw m2, m6 + punpckhqdq m6, m4, m5 ;low:t3a high:t7 + punpcklqdq m4, m5 ;low:t2a high:t6 + psubsw m5, m4, m6 + paddsw m4, m6 + pmulhrsw m1, m7 ;low:-out9 high:out10 + pmulhrsw m2, m7 ;low:out6 high:-out5 + pmulhrsw m5, m7 ;low:out8 high:-out11 + pmulhrsw m4, m7 ;low:-out7 high:out4 + punpckhqdq m7, m4, m5 ;low:out4 high:-out11 + punpcklqdq m4, m5 ;low:-out7 high:out8 + punpckhqdq m5, m2, m1 ;low:-out5 high:out10 + punpcklqdq m2, m1 ;low:out6 high:-out9 + ret + + +INV_TXFM_16X4_FN flipadst, dct +INV_TXFM_16X4_FN flipadst, adst +INV_TXFM_16X4_FN flipadst, flipadst +INV_TXFM_16X4_FN flipadst, identity + +cglobal iflipadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_7ROWS coeffq, 16 + call m(iadst_16x4_internal_8bpc).main + call m(iadst_16x4_internal_8bpc).main_pass1_end + + punpcklwd m6, m7, m0 ;packed out11, out15 + punpckhwd m0, m7 ;packed -out0, -out4 + punpckhwd m7, m3, m4 ;packed out3, out7 + punpcklwd m4, m3 ;packed -out8, -out12 + mova m1, [coeffq+16*6] + punpckhwd m3, m1, m5 ;packed out1, out5 + punpcklwd m5, m1 ;packed -out10, -out14 + mova m1, [coeffq+16*7] + mova [coeffq+16*6], m3 + mova [coeffq+16*7], m7 + punpcklwd m3, m2, m1 ;packed out9, out13 + punpckhwd m1, m2 ;packed -out2, -out6 + + mova m7, [o(pw_m16384)] + jmp m(iadst_16x4_internal_8bpc).pass1_end + 
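+ ; pass 2 reuses the 8x4 second-pass kernel via pass2_end, which runs
+ ; tx2q on the left eight columns and then on the right half at dstq+8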
+.pass2: + lea tx2q, [o(m(iflipadst_8x4_internal_8bpc).pass2)] + jmp m(idct_16x4_internal_8bpc).pass2_end + + +INV_TXFM_16X4_FN identity, dct +INV_TXFM_16X4_FN identity, adst +INV_TXFM_16X4_FN identity, flipadst +INV_TXFM_16X4_FN identity, identity + +cglobal iidentity_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m1, [coeffq+16*6] + mova m0, [coeffq+16*5] + mova m2, [coeffq+16*7] + mova m6, [o(pw_1697x16)] + mova m7, [o(pw_16384)] + pmulhrsw m4, m6, m1 + pmulhrsw m3, m6, m0 + pmulhrsw m5, m6, m2 + pmulhrsw m4, m7 + pmulhrsw m3, m7 + pmulhrsw m5, m7 + paddsw m1, m4 + paddsw m0, m3 + paddsw m5, m2 + mova m2, [coeffq+16*2] + mova m3, [coeffq+16*3] + mova m4, [coeffq+16*4] + mova [coeffq+16*6], m1 + mova [coeffq+16*5], m0 + mova [coeffq+16*7], m5 + pmulhrsw m0, m6, m2 + pmulhrsw m1, m6, m3 + pmulhrsw m5, m6, m4 + pmulhrsw m0, m7 + pmulhrsw m1, m7 + pmulhrsw m5, m7 + paddsw m2, m0 + paddsw m3, m1 + paddsw m4, m5 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + pmulhrsw m5, m6, m0 + pmulhrsw m6, m1 + pmulhrsw m5, m7 + pmulhrsw m6, m7 + paddsw m0, m5 + paddsw m1, m6 + mova m6, [coeffq+16*6] + mova m5, [coeffq+16*5] + punpckhwd m7, m0, m2 ;packed out1, out5 + punpcklwd m0, m2 ;packed out0, out4 + punpckhwd m2, m1, m3 ;packed out3, out7 + punpcklwd m1, m3 ;packed out2, out6 + mova [coeffq+16*6], m7 + mova m7, [coeffq+16*7] + punpckhwd m3, m4, m6 ;packed out9, out13 + punpcklwd m4, m6 ;packed out8, out12 + punpckhwd m6, m5, m7 ;packed out11, out15 + punpcklwd m5, m7 ;packed out10, out14 + jmp m(idct_16x4_internal_8bpc).pass1_end3 + +.pass2: + lea tx2q, [o(m(iidentity_8x4_internal_8bpc).pass2)] + jmp m(idct_16x4_internal_8bpc).pass2_end + + +%macro SAVE_8ROWS 2 ;src, stride + mova [%1+%2*0], m0 + mova [%1+%2*1], m1 + mova [%1+%2*2], m2 + mova [%1+%2*3], m3 + mova [%1+%2*4], m4 + mova [%1+%2*5], m5 + mova [%1+%2*6], m6 + mova [%1+%2*7], m7 +%endmacro + +%macro INV_TXFM_8X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x16, 8, 16*16 +%ifidn %1_%2, dct_dct + pshuflw m0, [coeffq], q0000 + punpcklwd m0, m0 + mova m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + mova m2, [o(pw_16384)] + mov [coeffq], eobd + pmulhrsw m0, m1 + pmulhrsw m0, m2 + psrlw m2, 3 ; pw_2048 + pmulhrsw m0, m1 + pmulhrsw m0, m2 + mov r3d, 4 + lea tx2q, [o(m(inv_txfm_add_dct_dct_8x16_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop +.end: + RET +%endif +%endmacro + +INV_TXFM_8X16_FN dct, dct +INV_TXFM_8X16_FN dct, adst +INV_TXFM_8X16_FN dct, flipadst +INV_TXFM_8X16_FN dct, identity + +cglobal idct_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(idct_8x8_internal_8bpc).pass1)] + +.pass1: + LOAD_8ROWS coeffq+16*1, 32, 1 + mov [rsp+gprsize+16*11], tx2q + lea tx2q, [o(m(idct_8x16_internal_8bpc).pass1_end)] + jmp r3 + +.pass1_end: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS coeffq+16*0, 32, 1 + mov tx2q, [rsp+gprsize+16*11] + jmp r3 + +.pass2: + lea tx2q, [o(m(idct_8x16_internal_8bpc).end)] + +.pass2_pre: + mova [coeffq+16*2 ], m1 + mova [coeffq+16*6 ], m3 + mova [coeffq+16*10], m5 + mova [coeffq+16*14], m7 + mova m1, m2 + mova m2, m4 + mova m3, m6 + mova m4, [coeffq+16*1 ] + mova m5, [coeffq+16*5 ] + mova m6, [coeffq+16*9 ] + mova m7, [coeffq+16*13] + +.pass2_main: + call m(idct_8x8_internal_8bpc).main + + SAVE_7ROWS rsp+gprsize+16*3, 16 + mova m0, [coeffq+16*2 ] + mova m1, [coeffq+16*6 ] + mova m2, [coeffq+16*10] + mova m3, [coeffq+16*14] + mova m4, [coeffq+16*3 ] + mova m5, [coeffq+16*7 ] + mova m6, [coeffq+16*11] + mova m7, [coeffq+16*15] + call m(idct_16x8_internal_8bpc).main + + mov r3, dstq + 
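+ ; the second main call left rows 8-15 in m0-m7; write those first at
+ ; dstq+8*stride, then .end reloads rows 0-7 from the stack and stores
+ ; them at the dst pointer saved in r3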
lea dstq, [dstq+strideq*8] + jmp m(idct_8x8_internal_8bpc).end + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + mov dstq, r3 + jmp m(idct_8x8_internal_8bpc).end + +.end1: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + ret + +INV_TXFM_8X16_FN adst, dct +INV_TXFM_8X16_FN adst, adst +INV_TXFM_8X16_FN adst, flipadst +INV_TXFM_8X16_FN adst, identity + +cglobal iadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(iadst_8x8_internal_8bpc).pass1)] + jmp m(idct_8x16_internal_8bpc).pass1 + +.pass2: + lea tx2q, [o(m(iadst_8x16_internal_8bpc).end)] + +.pass2_pre: + mova [rsp+gprsize+16*7], m0 + mova [rsp+gprsize+16*8], m1 + mova [rsp+gprsize+16*5], m6 + mova [rsp+gprsize+16*6], m7 + mova m0, m2 + mova m1, m3 + mova m2, m4 + mova m3, m5 + +.pass2_main: + mova m4, [coeffq+16*1 ] + mova m5, [coeffq+16*3 ] + mova m6, [coeffq+16*13] + mova m7, [coeffq+16*15] + mova [rsp+gprsize+16*3], m4 + mova [rsp+gprsize+16*4], m5 + mova [rsp+gprsize+16*9], m6 + mova [rsp+gprsize+32*5], m7 + mova m4, [coeffq+16*5 ] + mova m5, [coeffq+16*7 ] + mova m6, [coeffq+16*9 ] + mova m7, [coeffq+16*11] + + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass2_end + + mov r3, dstq + lea dstq, [dstq+strideq*8] + jmp m(iadst_8x8_internal_8bpc).end + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + mov dstq, r3 + jmp m(iadst_8x8_internal_8bpc).end + + +INV_TXFM_8X16_FN flipadst, dct +INV_TXFM_8X16_FN flipadst, adst +INV_TXFM_8X16_FN flipadst, flipadst +INV_TXFM_8X16_FN flipadst, identity + +cglobal iflipadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(iflipadst_8x8_internal_8bpc).pass1)] + jmp m(idct_8x16_internal_8bpc).pass1 + +.pass2: + lea tx2q, [o(m(iflipadst_8x16_internal_8bpc).end)] + lea r3, [dstq+strideq*8] + +.pass2_pre: + mova [rsp+gprsize+16*7], m0 + mova [rsp+gprsize+16*8], m1 + mova [rsp+gprsize+16*5], m6 + mova [rsp+gprsize+16*6], m7 + mova m0, m2 + mova m1, m3 + mova m2, m4 + mova m3, m5 + +.pass2_main: + mova m4, [coeffq+16*1 ] + mova m5, [coeffq+16*3 ] + mova m6, [coeffq+16*13] + mova m7, [coeffq+16*15] + mova [rsp+gprsize+16*3], m4 + mova [rsp+gprsize+16*4], m5 + mova [rsp+gprsize+16*9], m6 + mova [rsp+gprsize+32*5], m7 + mova m4, [coeffq+16*5 ] + mova m5, [coeffq+16*7 ] + mova m6, [coeffq+16*9 ] + mova m7, [coeffq+16*11] + + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass2_end + jmp m(iflipadst_8x8_internal_8bpc).end + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + mov dstq, r3 + jmp m(iflipadst_8x8_internal_8bpc).end + + +INV_TXFM_8X16_FN identity, dct +INV_TXFM_8X16_FN identity, adst +INV_TXFM_8X16_FN identity, flipadst +INV_TXFM_8X16_FN identity, identity + +cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq+16*1, 32, 1 + mov r3, tx2q + lea tx2q, [o(.pass1_end)] + mova [rsp+gprsize+16*1], m6 + jmp m(idct_8x8_internal_8bpc).pass1_end3 + +.pass1_end: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS coeffq+16*0, 32, 1 + mov tx2q, r3 + mova [rsp+gprsize+16*1], m6 + jmp m(idct_8x8_internal_8bpc).pass1_end3 + +.pass2: + lea tx2q, [o(.end1)] + +.end: + mova [rsp+gprsize+16*0], m7 + mova [rsp+gprsize+16*1], m6 + mova m7, [o(pw_1697x16)] + REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5 + mova m6, 
[rsp+gprsize+16*1] + mova [rsp+gprsize+16*2], m5 + IDTX16 6, 5, 7 + mova m5, [rsp+gprsize+16*0] + IDTX16 5, 7, 7 + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [rsp+gprsize+16*2] + mova [rsp+gprsize+16*0], m5 + mova [rsp+gprsize+16*1], m6 + mova [rsp+gprsize+16*2], m7 + jmp m(idct_8x8_internal_8bpc).end3 + +.end1: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + lea dstq, [dstq+strideq*2] + jmp .end + + +%macro INV_TXFM_16X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x8, 8, 16*16 +%ifidn %1_%2, dct_dct + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_16384)] + mov [coeffq], eobd + pmulhrsw m0, m1 + mov r2d, 4 + lea tx2q, [o(m(inv_txfm_add_dct_dct_16x8_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly +.end: + RET +%endif +%endmacro + +INV_TXFM_16X8_FN dct, dct +INV_TXFM_16X8_FN dct, adst +INV_TXFM_16X8_FN dct, flipadst +INV_TXFM_16X8_FN dct, identity + +cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq+16*0, 32, 1 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + LOAD_8ROWS coeffq+16*1, 32, 1 + call .main + mov r3, tx2q + lea tx2q, [o(.pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mov tx2q, r3 + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass2: + lea tx2q, [o(.end)] + lea r3, [dstq+8] + jmp m(idct_8x8_internal_8bpc).pass2_main + +.end: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + mov dstq, r3 + jmp m(idct_8x8_internal_8bpc).pass2_main + + +ALIGN function_align +cglobal_label .main + mova [rsp+gprsize*2+16*1], m2 + mova [rsp+gprsize*2+16*2], m6 + mova [rsp+gprsize*2+32*5], m5 + + mova m6, [o(pd_2048)] + ITX_MULSUB_2W 0, 7, 2, 5, 6, 401, 4076 ;t8a, t15a + ITX_MULSUB_2W 4, 3, 2, 5, 6, 3166, 2598 ;t9a, t14a + psubsw m2, m0, m4 ;t9 + paddsw m0, m4 ;t8 + psubsw m4, m7, m3 ;t14 + paddsw m7, m3 ;t15 + ITX_MULSUB_2W 4, 2, 3, 5, 6, 1567, 3784 ;t9a, t14a + mova m3, [rsp+gprsize*2+16*1] + mova m5, [rsp+gprsize*2+32*5] + mova [rsp+gprsize*2+16*1], m2 + mova [rsp+gprsize*2+32*5], m4 + mova m2, [rsp+gprsize*2+16*2] + mova [rsp+gprsize*2+16*2], m7 + ITX_MULSUB_2W 3, 5, 7, 4, 6, 1931, 3612 ;t10a, t13a + ITX_MULSUB_2W 2, 1, 7, 4, 6, 3920, 1189 ;t11a, t12a + psubsw m4, m2, m3 ;t10 + paddsw m2, m3 ;t11 + psubsw m3, m1, m5 ;t13 + paddsw m1, m5 ;t12 + ITX_MULSUB_2W 3, 4, 7, 5, 6, m3784, 1567 ;t10a, t13a + mova m7, [rsp+gprsize*2+32*5] + psubsw m6, m0, m2 ;t11a + paddsw m0, m2 ;t8a + paddsw m2, m7, m3 ;t9 + psubsw m7, m3 ;t10 + mova m5, [rsp+gprsize*2+16*0] + psubsw m3, m5, m0 ;out8 + paddsw m0, m5 ;out7 + mova [rsp+gprsize*2+32*5], m0 + mova m5, [rsp+gprsize*2+16*9] + psubsw m0, m5, m2 ;out9 + paddsw m2, m5 ;out6 + mova [rsp+gprsize*2+16*0], m0 + mova [rsp+gprsize*2+16*9], m2 + mova m0, [rsp+gprsize*2+16*1] + mova m2, [rsp+gprsize*2+16*2] + mova [rsp+gprsize*2+16*1], m3 + psubsw m5, m0, m4 ;t13 + paddsw m0, m4 ;t14 + mova m3, [o(pd_2048)] + psubsw m4, m2, m1 ;t12a + paddsw m1, m2 ;t15a + mova [rsp+gprsize*2+16*2], m1 + ITX_MULSUB_2W 5, 7, 1, 2, 3, 2896, 2896 ;t10a, t13a + ITX_MULSUB_2W 4, 6, 1, 2, 3, 2896, 2896 ;t11, t12 + mova m3, [rsp+gprsize*2+16*8] + psubsw m2, m3, m5 ;out10 + paddsw m3, m5 ;out5 + mova m5, [rsp+gprsize*2+16*7] + mova [rsp+gprsize*2+16*8], m3 + psubsw m3, m5, m4 ;out11 + paddsw m5, m4 ;out4 + mova m4, [rsp+gprsize*2+16*6] + mova [rsp+gprsize*2+16*7], m5 + paddsw m5, 
m4, m6 ;out3 + psubsw m4, m6 ;out12 + mova m6, [rsp+gprsize*2+16*5] + mova [rsp+gprsize*2+16*6], m5 + psubsw m5, m6, m7 ;out13 + paddsw m6, m7 ;out2 + mova m7, [rsp+gprsize*2+16*4] + mova [rsp+gprsize*2+16*5], m6 + psubsw m6, m7, m0 ;out14 + paddsw m7, m0 ;out1 + mova m1, [rsp+gprsize*2+16*2] + mova m0, [rsp+gprsize*2+16*3] + mova [rsp+gprsize*2+16*4], m7 + psubsw m7, m0, m1 ;out15 + paddsw m0, m1 ;out0 + mova [rsp+gprsize*2+16*3], m0 + mova m1, [rsp+gprsize*2+16*0] + mova m0, [rsp+gprsize*2+16*1] + mova [rsp+gprsize*2+16*0], m7 + ret + +INV_TXFM_16X8_FN adst, dct +INV_TXFM_16X8_FN adst, adst +INV_TXFM_16X8_FN adst, flipadst +INV_TXFM_16X8_FN adst, identity + +cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m7, [o(pw_2896x8)] + pmulhrsw m0, m7, [coeffq+16*0 ] + pmulhrsw m1, m7, [coeffq+16*1 ] + pmulhrsw m2, m7, [coeffq+16*14] + pmulhrsw m3, m7, [coeffq+16*15] + mova [rsp+gprsize+16*7], m0 + mova [rsp+gprsize+16*8], m1 + mova [rsp+gprsize+16*9], m2 + mova [rsp+gprsize+32*5], m3 + pmulhrsw m0, m7, [coeffq+16*6 ] + pmulhrsw m1, m7, [coeffq+16*7 ] + pmulhrsw m2, m7, [coeffq+16*8 ] + pmulhrsw m3, m7, [coeffq+16*9 ] + mova [rsp+gprsize+16*3], m2 + mova [rsp+gprsize+16*4], m3 + mova [rsp+gprsize+16*5], m0 + mova [rsp+gprsize+16*6], m1 + pmulhrsw m0, m7, [coeffq+16*2 ] + pmulhrsw m1, m7, [coeffq+16*3 ] + pmulhrsw m2, m7, [coeffq+16*4 ] + pmulhrsw m3, m7, [coeffq+16*5 ] + pmulhrsw m4, m7, [coeffq+16*10] + pmulhrsw m5, m7, [coeffq+16*11] + pmulhrsw m6, m7, [coeffq+16*12] + pmulhrsw m7, [coeffq+16*13] + + call .main + call .main_pass1_end + mov r3, tx2q + lea tx2q, [o(.pass1_end)] + jmp m(iadst_8x8_internal_8bpc).pass1_end + +.pass1_end: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mov tx2q, r3 + jmp m(iadst_8x8_internal_8bpc).pass1_end + +.pass2: + lea tx2q, [o(.end)] + lea r3, [dstq+8] + jmp m(iadst_8x8_internal_8bpc).pass2_main + +.end: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + mov dstq, r3 + jmp m(iadst_8x8_internal_8bpc).pass2_main + +ALIGN function_align +cglobal_label .main + mova [rsp+gprsize*2+16*0], m1 + mova [rsp+gprsize*2+16*1], m2 + mova [rsp+gprsize*2+16*2], m6 + + mova m6, [o(pd_2048)] + ITX_MULSUB_2W 7, 0, 1, 2, 6, 995, 3973 ;t3, t2 + ITX_MULSUB_2W 3, 4, 1, 2, 6, 3513, 2106 ;t11, t10 + psubsw m1, m0, m4 ;t10a + paddsw m0, m4 ;t2a + psubsw m4, m7, m3 ;t11a + paddsw m3, m7 ;t3a + ITX_MULSUB_2W 1, 4, 7, 2, 6, 3406, 2276 ;t11, t10 + mova m2, [rsp+gprsize*2+16*0] ;in3 + mova m7, [rsp+gprsize*2+16*1] ;in4 + mova [rsp+gprsize*2+16*0], m1 ;t11 + mova [rsp+gprsize*2+16*1], m4 ;t10 + mova m1, [rsp+gprsize*2+16*2] ;in12 + mova [rsp+gprsize*2+16*2], m0 ;t2a + ITX_MULSUB_2W 5, 7, 0, 4, 6, 1751, 3703 ;t5, t4 + ITX_MULSUB_2W 2, 1, 0, 4, 6, 3857, 1380 ;t13, t12 + psubsw m0, m7, m1 ;t12a + paddsw m1, m7 ;t4a + psubsw m4, m5, m2 ;t13a + paddsw m5, m2 ;t5a + ITX_MULSUB_2W 4, 0, 7, 2, 6, 4017, 799 ;t12, t13 + mova m2, [rsp+gprsize*2+16*8] ;in1 + mova m7, [rsp+gprsize*2+16*9] ;in14 + mova [rsp+gprsize*2+16*8], m4 ;t12 + mova [rsp+gprsize*2+16*9], m0 ;t13 + mova m4, [rsp+gprsize*2+16*4] ;in9 + mova m0, [rsp+gprsize*2+16*5] ;in6 + mova [rsp+gprsize*2+16*4], m1 ;t4a + mova [rsp+gprsize*2+16*5], m5 ;t5a + ITX_MULSUB_2W 2, 7, 1, 5, 6, 4052, 601 ;t15, t14 + ITX_MULSUB_2W 4, 0, 1, 5, 6, 2440, 3290 ;t7, t6 + psubsw m1, m0, m7 ;t14a + paddsw m0, m7 ;t6a + psubsw m5, m4, m2 ;t15a + paddsw m4, m2 ;t7a + ITX_MULSUB_2W 5, 1, 7, 2, 6, 2276, 3406 ;t14, t15 + mova m2, [rsp+gprsize*2+16*2] ;t2a + mova 
[rsp+gprsize*2+16*2], m5 ;t14 + psubsw m7, m2, m0 ;t6 + paddsw m2, m0 ;t2 + psubsw m0, m3, m4 ;t7 + paddsw m3, m4 ;t3 + ITX_MULSUB_2W 0, 7, 4, 5, 6, 3784, 1567 ;t6a, t7a + mova m4, [rsp+gprsize*2+16*7] ;in0 + mova m5, [rsp+gprsize*2+32*5] ;in15 + mova [rsp+gprsize*2+16*7], m3 ;t3 + mova [rsp+gprsize*2+32*5], m1 ;t15 + mova m1, [rsp+gprsize*2+16*6] ;in7 + mova m3, [rsp+gprsize*2+16*3] ;in8 + mova [rsp+gprsize*2+16*6], m7 ;t7a + mova [rsp+gprsize*2+16*3], m0 ;t6a + ITX_MULSUB_2W 5, 4, 0, 7, 6, 201, 4091 ;t1, t0 + ITX_MULSUB_2W 1, 3, 0, 7, 6, 3035, 2751 ;t9, t8 + psubsw m0, m4, m3 ;t8a + paddsw m4, m3 ;t0a + psubsw m3, m5, m1 ;t9a + paddsw m5, m1 ;t1a + ITX_MULSUB_2W 0, 3, 1, 7, 6, 799, 4017 ;t9, t8 + mova m1, [rsp+gprsize*2+16*4] ;t4a + mova m7, [rsp+gprsize*2+16*5] ;t5a + mova [rsp+gprsize*2+16*4], m3 ;t8 + mova [rsp+gprsize*2+16*5], m0 ;t9 + psubsw m0, m4, m1 ;t4 + paddsw m4, m1 ;t0 + psubsw m3, m5, m7 ;t5 + paddsw m5, m7 ;t1 + ITX_MULSUB_2W 0, 3, 1, 7, 6, 1567, 3784 ;t5a, t4a + mova m7, [rsp+gprsize*2+16*3] ;t6a + psubsw m1, m4, m2 ;t2a + paddsw m4, m2 ;out0 + mova [rsp+gprsize*2+16*3], m4 ;out0 + mova m4, [rsp+gprsize*2+16*6] ;t7a + psubsw m2, m3, m7 ;t6 + paddsw m3, m7 ;-out3 + mova [rsp+gprsize*2+16*6], m3 ;-out3 + psubsw m3, m0, m4 ;t7 + paddsw m0, m4 ;out12 + mova [rsp+gprsize*2+16*12], m3 + mova m3, [rsp+gprsize*2+16*7] ;t3 + mova [rsp+gprsize*2+16* 7], m2 ;out4 + psubsw m2, m5, m3 ;t3a + paddsw m5, m3 ;-out15 + mova [rsp+gprsize*2+16*11], m2 + mova m2, [rsp+gprsize*2+32*5] ;t15 + mova [rsp+gprsize*2+16*10], m1 ;-out7 + mova m1, [rsp+gprsize*2+16*0] ;t11 + mova [rsp+gprsize*2+16*0 ], m5 ;-out15 + mova m3, [rsp+gprsize*2+16*1] ;t10 + mova [rsp+gprsize*2+16*1 ], m4 ;-out11 + mova m4, [rsp+gprsize*2+16*2] ;t14 + mova [rsp+gprsize*2+16*2 ], m0 ;out12 + psubsw m0, m3, m4 ;t14a + paddsw m3, m4 ;t10a + psubsw m5, m1, m2 ;t15a + paddsw m1, m2 ;t11a + ITX_MULSUB_2W 5, 0, 2, 4, 6, 3784, 1567 ;t14, t15 + mova m2, [rsp+gprsize*2+16*4] ;t8 + mova m4, [rsp+gprsize*2+16*5] ;t9 + mova [rsp+gprsize*2+16*4], m3 ;t10a + mova [rsp+gprsize*2+16*5], m1 ;t11a + mova m3, [rsp+gprsize*2+16*8] ;t12 + mova m1, [rsp+gprsize*2+16*9] ;t13 + mova [rsp+gprsize*2+16*8], m5 ;t14 + mova [rsp+gprsize*2+16*9], m0 ;t15 + psubsw m5, m2, m3 ;t12a + paddsw m2, m3 ;t8a + psubsw m0, m4, m1 ;t13a + paddsw m4, m1 ;t9a + ITX_MULSUB_2W 5, 0, 1, 3, 6, 1567, 3784 ;t13, t12 + mova m6, [rsp+gprsize*2+16*4] ;t10a + mova m1, [rsp+gprsize*2+16*5] ;t11a + psubsw m3, m2, m6 ;t10 + paddsw m2, m6 ;-out1 + paddsw m6, m4, m1 ;out14 + psubsw m4, m1 ;t11 + mova [rsp+gprsize*2+16*14], m4 + mova [rsp+gprsize*2+16* 4], m2 ;-out1 + mova m4, [rsp+gprsize*2+16*8] ;t14 + mova m2, [rsp+gprsize*2+16*9] ;t15 + mova [rsp+gprsize*2+16* 9], m3 ;out6 + psubsw m3, m0, m4 ;t14a + paddsw m0, m4 ;out2 + psubsw m4, m5, m2 ;t15a + paddsw m5, m2 ;-out13 + mova [rsp+gprsize*2+16* 5], m0 ;out2 + ret +ALIGN function_align +.main_pass1_end: + mova m0, [rsp+gprsize*2+16*14] + mova [rsp+gprsize*2+16*14], m5 + mova [rsp+gprsize*2+16*15], m6 + mova m5, [o(pw_2896_2896)] + mova m6, [o(pw_2896_m2896)] + mova m7, [o(pd_2048)] + punpcklwd m2, m3, m4 + punpckhwd m3, m4 + pmaddwd m4, m5, m2 + pmaddwd m2, m6 + pmaddwd m1, m5, m3 + pmaddwd m3, m6 + REPX {paddd x, m7}, m4, m2, m1, m3 + REPX {psrad x, 12}, m4, m1, m2, m3 + packssdw m4, m1 ;-out5 + packssdw m2, m3 ;out10 + mova [rsp+gprsize*2+16* 8], m4 + mova m3, [rsp+gprsize*2+16* 9] + punpcklwd m1, m3, m0 + punpckhwd m3, m0 + pmaddwd m0, m5, m1 + pmaddwd m1, m6 + pmaddwd m4, m5, m3 + pmaddwd m3, m6 + REPX {paddd x, m7}, m0, m1, 
m4, m3 + REPX {psrad x, 12}, m0, m4, m1, m3 + packssdw m0, m4 ;out6 + packssdw m1, m3 ;-out9 + mova [rsp+gprsize*2+16* 9], m0 + mova m0, [rsp+gprsize*2+16* 7] + mova m4, [rsp+gprsize*2+16*12] + punpcklwd m3, m0, m4 + punpckhwd m0, m4 + pmaddwd m4, m5, m3 + pmaddwd m3, m6 + pmaddwd m5, m0 + pmaddwd m0, m6 + REPX {paddd x, m7}, m4, m3, m5, m0 + REPX {psrad x, 12}, m4, m5, m3, m0 + packssdw m4, m5 ;out4 + packssdw m3, m0 ;-out11 + mova [rsp+gprsize*2+16* 7], m4 + mova m4, [rsp+gprsize*2+16*10] + mova m5, [rsp+gprsize*2+16*11] + punpcklwd m0, m4, m5 + punpckhwd m4, m5 + pmaddwd m5, m0, [o(pw_2896_2896)] + pmaddwd m0, m6 + pmaddwd m6, m4 + pmaddwd m4, [o(pw_2896_2896)] + REPX {paddd x, m7}, m5, m0, m6, m4 + REPX {psrad x, 12}, m0, m6, m5, m4 + packssdw m0, m6 ;out8 + packssdw m5, m4 ;-out7 + mova [rsp+gprsize*2+16*10], m5 + mova m4, [rsp+gprsize*2+16* 2] ;out12 + mova m5, [rsp+gprsize*2+16*14] ;-out13 + mova m6, [rsp+gprsize*2+16*15] ;out14 + ret +ALIGN function_align +cglobal_label .main_pass2_end + mova m7, [o(pw_2896x8)] + mova m1, [rsp+gprsize*2+16* 9] + mova m2, [rsp+gprsize*2+16*14] + paddsw m0, m1, m2 + psubsw m1, m2 + pmulhrsw m0, m7 ;out6 + pmulhrsw m1, m7 ;-out9 + mova [rsp+gprsize*2+16* 9], m0 + psubsw m2, m3, m4 + paddsw m3, m4 + pmulhrsw m2, m7 ;out10 + pmulhrsw m3, m7 ;-out5 + mova [rsp+gprsize*2+16* 8], m3 + mova m3, [rsp+gprsize*2+16* 7] + mova m4, [rsp+gprsize*2+16*12] + paddsw m0, m3, m4 + psubsw m3, m4 + pmulhrsw m0, m7 ;out4 + pmulhrsw m3, m7 ;-out11 + mova [rsp+gprsize*2+16* 7], m0 + mova m0, [rsp+gprsize*2+16*10] + paddsw m4, m0, [rsp+gprsize*2+16*11] + psubsw m0, [rsp+gprsize*2+16*11] + pmulhrsw m4, m7 ;-out7 + pmulhrsw m0, m7 ;out8 + mova [rsp+gprsize*2+16*10], m4 + mova m4, [rsp+gprsize*2+16*2 ] ;out12 + ret + +INV_TXFM_16X8_FN flipadst, dct +INV_TXFM_16X8_FN flipadst, adst +INV_TXFM_16X8_FN flipadst, flipadst +INV_TXFM_16X8_FN flipadst, identity + +cglobal iflipadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m7, [o(pw_2896x8)] + pmulhrsw m0, m7, [coeffq+16*0 ] + pmulhrsw m1, m7, [coeffq+16*1 ] + pmulhrsw m2, m7, [coeffq+16*14] + pmulhrsw m3, m7, [coeffq+16*15] + mova [rsp+gprsize+16*7], m0 + mova [rsp+gprsize+16*8], m1 + mova [rsp+gprsize+16*9], m2 + mova [rsp+gprsize+32*5], m3 + pmulhrsw m0, m7, [coeffq+16*6 ] + pmulhrsw m1, m7, [coeffq+16*7 ] + pmulhrsw m2, m7, [coeffq+16*8 ] + pmulhrsw m3, m7, [coeffq+16*9 ] + mova [rsp+gprsize+16*3], m2 + mova [rsp+gprsize+16*4], m3 + mova [rsp+gprsize+16*5], m0 + mova [rsp+gprsize+16*6], m1 + pmulhrsw m0, m7, [coeffq+16*2 ] + pmulhrsw m1, m7, [coeffq+16*3 ] + pmulhrsw m2, m7, [coeffq+16*4 ] + pmulhrsw m3, m7, [coeffq+16*5 ] + pmulhrsw m4, m7, [coeffq+16*10] + pmulhrsw m5, m7, [coeffq+16*11] + pmulhrsw m6, m7, [coeffq+16*12] + pmulhrsw m7, [coeffq+16*13] + + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end + + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS coeffq+16*0, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mov r3, tx2q + lea tx2q, [o(.pass1_end)] + jmp m(iflipadst_8x8_internal_8bpc).pass1_end + +.pass1_end: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS coeffq+16*0, 32 + mova [rsp+gprsize+16*0], m7 + mov tx2q, r3 + jmp m(iflipadst_8x8_internal_8bpc).pass1_end + +.pass2: + lea tx2q, [o(.end)] + lea r3, [dstq+8] + jmp m(iflipadst_8x8_internal_8bpc).pass2_main + +.end: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + mov dstq, r3 + jmp m(iflipadst_8x8_internal_8bpc).pass2_main + + +INV_TXFM_16X8_FN identity, dct 
+INV_TXFM_16X8_FN identity, adst +INV_TXFM_16X8_FN identity, flipadst +INV_TXFM_16X8_FN identity, identity + +cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + add coeffq, 16*16 + mova m4, [coeffq-16*7] + mova m5, [coeffq-16*5] + mova m6, [coeffq-16*3] + mova m7, [coeffq-16*1] + mov r3, tx2q + lea tx2q, [o(.pass1_end)] + +.pass1: + mova m0, [o(pw_2896x8)] + mova m2, [o(pw_1697x16)] + mova m3, [o(pw_16384)] + sub coeffq, 8*16 + REPX {pmulhrsw x, m0}, m4, m5, m6, m7 + pmulhrsw m1, m2, m4 + pmulhrsw m1, m3 + paddsw m1, m4 ; 1 + pmulhrsw m4, m2, m5 + pmulhrsw m4, m3 + paddsw m4, m5 ; 3 + pmulhrsw m5, m2, m6 + pmulhrsw m5, m3 + paddsw m5, m6 ; 5 + pmulhrsw m6, m2, m7 + pmulhrsw m6, m3 + paddsw m7, m6 ; 7 + pmulhrsw m6, m0, [coeffq+16*6] + mova [rsp+gprsize+16*0], m4 + pmulhrsw m4, m2, m6 + pmulhrsw m4, m3 + paddsw m6, m4 ; 6 + pmulhrsw m4, m0, [coeffq+16*4] + mova [rsp+gprsize+16*1], m6 + pmulhrsw m6, m2, m4 + pmulhrsw m6, m3 + paddsw m4, m6 ; 4 + pmulhrsw m6, m0, [coeffq+16*2] + pmulhrsw m0, [coeffq+16*0] + pmulhrsw m2, m6 + pmulhrsw m2, m3 + paddsw m2, m6 ; 2 + pmulhrsw m6, m0, [o(pw_1697x16)] + pmulhrsw m6, m3 + mova m3, [rsp+gprsize+16*0] + paddsw m0, m6 + jmp m(idct_8x8_internal_8bpc).pass1_end3 + +.pass1_end: + mova [coeffq+16*1], m4 + mova [coeffq+16*3], m5 + mova [coeffq+16*5], m6 + mova [coeffq+16*7], m7 + mova m4, [coeffq-16*7] + mova m5, [coeffq-16*5] + mova m6, [coeffq-16*3] + mova m7, [coeffq-16*1] + mova [coeffq-16*7], m0 + mova [coeffq-16*5], m1 + mova [coeffq-16*3], m2 + mova [coeffq-16*1], m3 + mov tx2q, r3 + jmp .pass1 + +.pass2: + lea tx2q, [o(.end)] + lea r3, [dstq+8] + jmp m(iidentity_8x8_internal_8bpc).end + +.end: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + mov dstq, r3 + jmp m(iidentity_8x8_internal_8bpc).end + + +%macro INV_TXFM_16X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x16, 8, 16*16 +%ifidn %1_%2, dct_dct + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] + mov [coeffq], eobd + mov r2d, 8 + lea tx2q, [o(m(inv_txfm_add_dct_dct_16x16_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly +.end: + RET +%endif +%endmacro + +INV_TXFM_16X16_FN dct, dct +INV_TXFM_16X16_FN dct, adst +INV_TXFM_16X16_FN dct, flipadst +INV_TXFM_16X16_FN dct, identity + +cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq+16*1, 64 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+16*3, 64 + call m(idct_16x8_internal_8bpc).main + mov r3, tx2q + lea tx2q, [o(.pass1_end)] + mova m7, [o(pw_8192)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end: + SAVE_8ROWS coeffq+16*17, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end1)] + mova m7, [o(pw_8192)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS coeffq+16*0, 64 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+16*2, 64 + call m(idct_16x8_internal_8bpc).main + lea tx2q, [o(.pass1_end2)] + mova m7, [o(pw_8192)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end2: + SAVE_8ROWS coeffq+16*16, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mov tx2q, r3 + mova m7, [o(pw_8192)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass2: + lea tx2q, [o(.end)] + jmp m(idct_8x16_internal_8bpc).pass2_pre + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.end1)] + mov dstq, r3 + 
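+ ; r3 is repointed below so .end1 can run the second pass-2 iteration
+ ; on the right eight columns (with coeffq advanced by 32*8)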
lea r3, [dstq+8] + jmp m(idct_8x8_internal_8bpc).end + +.end1: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + + add coeffq, 32*8 + mov dstq, r3 + + mova m0, [coeffq+16*0 ] + mova m1, [coeffq+16*4 ] + mova m2, [coeffq+16*8 ] + mova m3, [coeffq+16*12] + mova m4, [coeffq+16*1 ] + mova m5, [coeffq+16*5 ] + mova m6, [coeffq+16*9 ] + mova m7, [coeffq+16*13] + lea tx2q, [o(m(idct_8x16_internal_8bpc).end)] + jmp m(idct_8x16_internal_8bpc).pass2_main + + +%macro ITX_16X16_ADST_LOAD_ODD_COEFS 0 + mova m0, [coeffq+16*1 ] + mova m1, [coeffq+16*3 ] + mova m2, [coeffq+16*29] + mova m3, [coeffq+16*31] + mova [rsp+gprsize+16*7], m0 + mova [rsp+gprsize+16*8], m1 + mova [rsp+gprsize+16*9], m2 + mova [rsp+gprsize+32*5], m3 + mova m0, [coeffq+16*13] + mova m1, [coeffq+16*15] + mova m2, [coeffq+16*17] + mova m3, [coeffq+16*19] + mova [rsp+gprsize+16*3], m2 + mova [rsp+gprsize+16*4], m3 + mova [rsp+gprsize+16*5], m0 + mova [rsp+gprsize+16*6], m1 + mova m0, [coeffq+16*5 ] + mova m1, [coeffq+16*7 ] + mova m2, [coeffq+16*9 ] + mova m3, [coeffq+16*11] + mova m4, [coeffq+16*21] + mova m5, [coeffq+16*23] + mova m6, [coeffq+16*25] + mova m7, [coeffq+16*27] +%endmacro + +%macro ITX_16X16_ADST_LOAD_EVEN_COEFS 0 + mova m0, [coeffq+16*0 ] + mova m1, [coeffq+16*2 ] + mova m2, [coeffq+16*28] + mova m3, [coeffq+16*30] + mova [rsp+gprsize+16*7], m0 + mova [rsp+gprsize+16*8], m1 + mova [rsp+gprsize+16*9], m2 + mova [rsp+gprsize+32*5], m3 + mova m0, [coeffq+16*12] + mova m1, [coeffq+16*14] + mova m2, [coeffq+16*16] + mova m3, [coeffq+16*18] + mova [rsp+gprsize+16*3], m2 + mova [rsp+gprsize+16*4], m3 + mova [rsp+gprsize+16*5], m0 + mova [rsp+gprsize+16*6], m1 + mova m0, [coeffq+16*4 ] + mova m1, [coeffq+16*6 ] + mova m2, [coeffq+16*8 ] + mova m3, [coeffq+16*10] + mova m4, [coeffq+16*20] + mova m5, [coeffq+16*22] + mova m6, [coeffq+16*24] + mova m7, [coeffq+16*26] +%endmacro + +INV_TXFM_16X16_FN adst, dct +INV_TXFM_16X16_FN adst, adst +INV_TXFM_16X16_FN adst, flipadst + +cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + ITX_16X16_ADST_LOAD_ODD_COEFS + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end + + mov r3, tx2q + lea tx2q, [o(.pass1_end)] + mova m7, [o(pw_8192)] + jmp m(iadst_8x8_internal_8bpc).pass1_end1 + +.pass1_end: + SAVE_8ROWS coeffq+16*17, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end1)] + mova m7, [o(pw_8192)] + jmp m(iadst_8x8_internal_8bpc).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+16*1, 32 + ITX_16X16_ADST_LOAD_EVEN_COEFS + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end + + lea tx2q, [o(.pass1_end2)] + mova m7, [o(pw_8192)] + jmp m(iadst_8x8_internal_8bpc).pass1_end1 + +.pass1_end2: + SAVE_8ROWS coeffq+16*16, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mov tx2q, r3 + mova m7, [o(pw_8192)] + jmp m(iadst_8x8_internal_8bpc).pass1_end1 + +.pass2: + lea tx2q, [o(.end)] + jmp m(iadst_8x16_internal_8bpc).pass2_pre + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.end1)] + mov dstq, r3 + lea r3, [dstq+8] + jmp m(iadst_8x8_internal_8bpc).end + +.end1: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + + add coeffq, 32*8 + mov dstq, r3 + + mova m4, [coeffq+16*0 ] + mova m5, [coeffq+16*2 ] + mova m0, [coeffq+16*4 ] + mova m1, [coeffq+16*6 ] + mova m2, [coeffq+16*8 ] + mova m3, [coeffq+16*10] + mova m6, 
[coeffq+16*12] + mova m7, [coeffq+16*14] + mova [rsp+gprsize+16*7], m4 + mova [rsp+gprsize+16*8], m5 + mova [rsp+gprsize+16*5], m6 + mova [rsp+gprsize+16*6], m7 + lea tx2q, [o(m(iadst_8x16_internal_8bpc).end)] + jmp m(iadst_8x16_internal_8bpc).pass2_main + + +INV_TXFM_16X16_FN flipadst, dct +INV_TXFM_16X16_FN flipadst, adst +INV_TXFM_16X16_FN flipadst, flipadst + +cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + ITX_16X16_ADST_LOAD_ODD_COEFS + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end + + mov r3, tx2q + lea tx2q, [o(.pass1_end)] + mova m7, [o(pw_m8192)] + jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 + +.pass1_end: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end1)] + mova m7, [o(pw_m8192)] + jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+16*17, 32 + ITX_16X16_ADST_LOAD_EVEN_COEFS + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end + + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS coeffq+16*0, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end2)] + mova m7, [o(pw_m8192)] + jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 + +.pass1_end2: + SAVE_8ROWS coeffq+16*16, 32 + LOAD_8ROWS coeffq+16* 0, 32 + mova [rsp+gprsize+16*0], m7 + mov tx2q, r3 + mova m7, [o(pw_m8192)] + jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 + +.pass2: + lea tx2q, [o(.end)] + lea r3, [dstq+8] + jmp m(iflipadst_8x16_internal_8bpc).pass2_pre + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.end1)] + lea dstq, [dstq+strideq*2] + jmp m(iflipadst_8x8_internal_8bpc).end + +.end1: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + + add coeffq, 32*8 + + mova m4, [coeffq+16*0 ] + mova m5, [coeffq+16*2 ] + mova m0, [coeffq+16*4 ] + mova m1, [coeffq+16*6 ] + mova m2, [coeffq+16*8 ] + mova m3, [coeffq+16*10] + mova m6, [coeffq+16*12] + mova m7, [coeffq+16*14] + mova [rsp+gprsize+16*7], m4 + mova [rsp+gprsize+16*8], m5 + mova [rsp+gprsize+16*5], m6 + mova [rsp+gprsize+16*6], m7 + + lea tx2q, [o(.end2)] + mov dstq, r3 + jmp m(iflipadst_8x16_internal_8bpc).pass2_main + +.end2: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + lea dstq, [dstq+strideq*2] + jmp m(iflipadst_8x8_internal_8bpc).end + + +%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16 + pmulhrsw m%2, m%3, m%1 + psraw m%2, 1 + pavgw m%1, m%2 +%endmacro + +INV_TXFM_16X16_FN identity, dct +INV_TXFM_16X16_FN identity, identity + +cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + add coeffq, 16*17 + mov r3, tx2q + lea tx2q, [o(.pass1_end)] + +.pass1: + mova m6, [o(pw_1697x16)] + mova m7, [coeffq+32*6] + mova m0, [coeffq+32*0] + mova m1, [coeffq+32*1] + mova m2, [coeffq+32*2] + mova m3, [coeffq+32*3] + mova m4, [coeffq+32*4] + REPX {IDTX16B x, 5, 6}, 7, 0, 1, 2, 3, 4 + mova m5, [coeffq+32*5] + mova [rsp+gprsize+16*1], m7 + IDTX16B 5, 7, 6 + mova m7, [coeffq+32*7] + IDTX16B 7, 6, 6 + jmp m(idct_8x8_internal_8bpc).pass1_end3 + +.pass1_end: + SAVE_8ROWS coeffq, 32 + sub coeffq, 16 + lea tx2q, [o(.pass1_end1)] + jmp .pass1 + +.pass1_end1: + SAVE_8ROWS coeffq, 32 + sub coeffq, 15*16 + lea tx2q, [o(.pass1_end2)] + jmp .pass1 + +.pass1_end2: + SAVE_8ROWS coeffq, 32 + sub coeffq, 16 + mov tx2q, r3 + jmp .pass1 + +.pass2: + lea r3, [dstq+8] + lea tx2q, [o(.end1)] 
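+ ; .end applies the 16-point identity gain of 2*sqrt(2): IDTX16 adds
+ ; pmulhrsw(x, 1697*16) to 2*x, and pw_2048 provides the final rounding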
+ +.end: + mova [rsp+gprsize+16*0], m7 + mova [rsp+gprsize+16*1], m4 + mova m7, [o(pw_1697x16)] + REPX {IDTX16 x, 4, 7}, 5, 6, 0, 1, 2, 3 + mova m4, [o(pw_2048)] + pmulhrsw m5, m4 + pmulhrsw m6, m4 + mova [rsp+gprsize+16*2], m5 + mova m5, [rsp+gprsize+16*1] + mova [rsp+gprsize+16*1], m6 + IDTX16 5, 6, 7 + mova m6, [rsp+gprsize+16*0] + IDTX16 6, 7, 7 + REPX {pmulhrsw x, m4}, m0, m1, m2, m3, m6 + pmulhrsw m4, m5 + mova [rsp+gprsize+16*0], m6 + jmp m(idct_8x8_internal_8bpc).end3 + +.end1: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(.end2)] + lea dstq, [dstq+strideq*2] + jmp .end + +.end2: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + + add coeffq, 32*8 + LOAD_8ROWS coeffq, 32 + lea tx2q, [o(.end3)] + mov dstq, r3 + jmp .end + +.end3: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + lea dstq, [dstq+strideq*2] + jmp .end + + +cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + call m(idct_8x32_internal_8bpc) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] + mov [coeffq], eobd + pmulhrsw m0, m2 + psrlw m2, 2 ;pw_2048 + pmulhrsw m0, m1 + pmulhrsw m0, m2 + pshuflw m0, m0, q0000 + punpcklwd m0, m0 + mov r3d, 8 + lea tx2q, [o(.end)] + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop + +.end: + RET + + + +cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + cmp eobd, 106 + jle .fast + + LOAD_8ROWS coeffq+16*3, 64 + call m(idct_8x8_internal_8bpc).main + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1: + mova [rsp+gprsize+16*9 ], m0 ;in24 + mova [rsp+gprsize+16*10], m4 ;in28 + mova [rsp+gprsize+16*17], m2 ;in26 + mova [rsp+gprsize+16*18], m6 ;in30 + mova [rsp+gprsize+16*31], m1 ;in25 + mova [rsp+gprsize+16*30], m3 ;in27 + mova [rsp+gprsize+16*27], m5 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + LOAD_8ROWS coeffq+16*2, 64 + call m(idct_8x8_internal_8bpc).main + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_1: + mova [rsp+gprsize+16*7 ], m0 ;in16 + mova [rsp+gprsize+16*8 ], m4 ;in20 + mova [rsp+gprsize+16*15], m2 ;in18 + mova [rsp+gprsize+16*16], m6 ;in22 + mova [rsp+gprsize+16*33], m1 ;in17 + mova [rsp+gprsize+16*28], m3 ;in19 + mova [rsp+gprsize+16*29], m5 ;in21 + mova [rsp+gprsize+16*32], m7 ;in23 + +.fast: + LOAD_8ROWS coeffq+16*1, 64 + call m(idct_8x8_internal_8bpc).main + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end: + mova [rsp+gprsize+16*5 ], m0 ;in8 + mova [rsp+gprsize+16*6 ], m4 ;in12 + mova [rsp+gprsize+16*13], m2 ;in10 + mova [rsp+gprsize+16*14], m6 ;in14 + mova [rsp+gprsize+16*21], m1 ;in9 + mova [rsp+gprsize+16*24], m3 ;in11 + mova [rsp+gprsize+16*25], m5 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + LOAD_8ROWS coeffq+16*0, 64 + call m(idct_8x8_internal_8bpc).main + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end1: + mova [rsp+gprsize+16*11], m2 ;in2 + mova [rsp+gprsize+16*12], m6 ;in6 + mova [rsp+gprsize+16*19], m1 ;in1 + mova [rsp+gprsize+16*26], m3 ;in3 + mova [rsp+gprsize+16*23], m5 ;in5 + mova [rsp+gprsize+16*22], m7 ;in7 + mova m1, m4 ;in4 + mova m2, [rsp+gprsize+16*5 ] ;in8 + mova m3, [rsp+gprsize+16*6 ] ;in12 + + cmp eobd, 106 + jg .full + + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call 
m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3 , 16 + mova m0, [rsp+gprsize+16*11] + mova m1, [rsp+gprsize+16*12] + mova m2, [rsp+gprsize+16*13] + mova m3, [rsp+gprsize+16*14] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + call .main_fast + jmp .pass2 + +.full: + mova m4, [rsp+gprsize+16*7 ] ;in16 + mova m5, [rsp+gprsize+16*8 ] ;in20 + mova m6, [rsp+gprsize+16*9 ] ;in24 + mova m7, [rsp+gprsize+16*10] ;in28 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3 , 16 + LOAD_8ROWS rsp+gprsize+16*11, 16 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + call .main + +.pass2: + lea r3, [o(.end6)] + +.end: + mova [rsp+gprsize+16*0 ], m7 + lea tx2q, [o(.end2)] + +.end1: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \ + 8, 9, 10, 11, 12, 13, 14, 15, \ + 16, 17, 18, 19, 20, 21, 22, 23, \ + 24, 25, 26, 27, 28, 29, 30, 31 + + jmp tx2q + +.end2: + lea tx2q, [o(.end3)] + jmp m(idct_8x8_internal_8bpc).end + +.end3: + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0 ], m7 + lea dstq, [dstq+strideq*2] + lea tx2q, [o(.end4)] + jmp m(idct_8x8_internal_8bpc).end + +.end4: + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0 ], m7 + lea dstq, [dstq+strideq*2] + lea tx2q, [o(.end5)] + jmp m(idct_8x8_internal_8bpc).end + +.end5: + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0 ], m7 + lea dstq, [dstq+strideq*2] + mov tx2q, r3 + jmp m(idct_8x8_internal_8bpc).end + +.end6: + ret + +ALIGN function_align +cglobal_label .main_veryfast + mova m0, [rsp+gprsize*2+16*19] ;in1 + pmulhrsw m3, m0, [o(pw_4091x8)] ;t30,t31 + pmulhrsw m0, [o(pw_201x8)] ;t16,t17 + mova m7, [o(pd_2048)] + mova [rsp+gprsize*2+16*19], m0 ;t16 + mova [rsp+gprsize*2+16*34], m3 ;t31 + ITX_MULSUB_2W 3, 0, 1, 2, 7, 799, 4017 ;t17a, t30a + mova [rsp+gprsize*2+16*20], m3 ;t17a + mova [rsp+gprsize*2+16*33], m0 ;t30a + mova m1, [rsp+gprsize*2+16*22] ;in7 + pmulhrsw m2, m1, [o(pw_3857x8)] ;t28,t29 + pmulhrsw m1, [o(pw_m1380x8)] ;t18,t19 + mova [rsp+gprsize*2+16*22], m1 ;t19 + mova [rsp+gprsize*2+16*31], m2 ;t28 + ITX_MULSUB_2W 2, 1, 0, 3, 7, m4017, 799 ;t18a, t29a + mova [rsp+gprsize*2+16*21], m2 ;t18a + mova [rsp+gprsize*2+16*32], m1 ;t29a + mova m0, [rsp+gprsize*2+16*23] ;in5 + pmulhrsw m3, m0, [o(pw_3973x8)] ;t26, t27 + pmulhrsw m0, [o(pw_995x8)] ;t20, t21 + mova [rsp+gprsize*2+16*23], m0 ;t20 + mova [rsp+gprsize*2+16*30], m3 ;t27 + ITX_MULSUB_2W 3, 0, 1, 2, 7, 3406, 2276 ;t21a, t26a + mova [rsp+gprsize*2+16*24], m3 ;t21a + mova [rsp+gprsize*2+16*29], m0 ;t26a + mova m2, [rsp+gprsize*2+16*26] ;in3 + pxor m0, m0 + mova m3, m0 + pmulhrsw m1, m2, [o(pw_4052x8)] + pmulhrsw m2, [o(pw_m601x8)] + jmp .main2 + +ALIGN function_align +cglobal_label .main_fast ;bottom half is zero + mova m0, [rsp+gprsize*2+16*19] ;in1 + mova m1, [rsp+gprsize*2+16*20] ;in15 + pmulhrsw m3, m0, [o(pw_4091x8)] ;t31a + pmulhrsw m0, [o(pw_201x8)] ;t16a + pmulhrsw m2, m1, [o(pw_3035x8)] ;t30a + pmulhrsw m1, [o(pw_m2751x8)] ;t17a + mova m7, [o(pd_2048)] + psubsw m4, m0, m1 ;t17 + paddsw m0, m1 ;t16 + psubsw m5, m3, m2 ;t30 + paddsw m3, m2 ;t31 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a + mova [rsp+gprsize*2+16*19], m0 ;t16 + mova [rsp+gprsize*2+16*20], m5 ;t17a + mova [rsp+gprsize*2+16*33], m4 ;t30a + mova [rsp+gprsize*2+16*34], m3 ;t31 + mova m0, [rsp+gprsize*2+16*21] ;in9 + mova m1, [rsp+gprsize*2+16*22] ;in7 + pmulhrsw m3, m0, 
[o(pw_3703x8)] + pmulhrsw m0, [o(pw_1751x8)] + pmulhrsw m2, m1, [o(pw_3857x8)] + pmulhrsw m1, [o(pw_m1380x8)] + psubsw m4, m1, m0 ;t18 + paddsw m0, m1 ;t19 + psubsw m5, m2, m3 ;t29 + paddsw m3, m2 ;t28 + ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a + mova [rsp+gprsize*2+16*21], m5 ;t18a + mova [rsp+gprsize*2+16*22], m0 ;t19 + mova [rsp+gprsize*2+16*31], m3 ;t28 + mova [rsp+gprsize*2+16*32], m4 ;t29a + mova m0, [rsp+gprsize*2+16*23] ;in5 + mova m1, [rsp+gprsize*2+16*24] ;in11 + pmulhrsw m3, m0, [o(pw_3973x8)] + pmulhrsw m0, [o(pw_995x8)] + pmulhrsw m2, m1, [o(pw_3513x8)] + pmulhrsw m1, [o(pw_m2106x8)] + psubsw m4, m0, m1 ;t21 + paddsw m0, m1 ;t20 + psubsw m5, m3, m2 ;t26 + paddsw m3, m2 ;t27 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 3406, 2276 ;t21a, t26a + mova [rsp+gprsize*2+16*23], m0 ;t20 + mova [rsp+gprsize*2+16*24], m5 ;t21a + mova [rsp+gprsize*2+16*29], m4 ;t26a + mova [rsp+gprsize*2+16*30], m3 ;t27 + mova m0, [rsp+gprsize*2+16*25] ;in13 + mova m2, [rsp+gprsize*2+16*26] ;in3 + pmulhrsw m3, m0, [o(pw_3290x8)] + pmulhrsw m0, [o(pw_2440x8)] + pmulhrsw m1, m2, [o(pw_4052x8)] + pmulhrsw m2, [o(pw_m601x8)] + jmp .main2 + +ALIGN function_align +cglobal_label .main + mova m7, [o(pd_2048)] + mova m0, [rsp+gprsize*2+16*19] ;in1 + mova m1, [rsp+gprsize*2+16*20] ;in15 + mova m2, [rsp+gprsize*2+16*33] ;in17 + mova m3, [rsp+gprsize*2+16*34] ;in31 + ITX_MULSUB_2W 0, 3, 4, 5, 7, 201, 4091 ;t16a, t31a + ITX_MULSUB_2W 2, 1, 4, 5, 7, 3035, 2751 ;t17a, t30a + psubsw m4, m0, m2 ;t17 + paddsw m0, m2 ;t16 + psubsw m5, m3, m1 ;t30 + paddsw m3, m1 ;t31 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a + mova [rsp+gprsize*2+16*19], m0 ;t16 + mova [rsp+gprsize*2+16*20], m5 ;t17a + mova [rsp+gprsize*2+16*33], m4 ;t30a + mova [rsp+gprsize*2+16*34], m3 ;t31 + mova m0, [rsp+gprsize*2+16*21] ;in9 + mova m1, [rsp+gprsize*2+16*22] ;in7 + mova m2, [rsp+gprsize*2+16*31] ;in25 + mova m3, [rsp+gprsize*2+16*32] ;in23 + ITX_MULSUB_2W 0, 3, 4, 5, 7, 1751, 3703 ;t18a, t29a + ITX_MULSUB_2W 2, 1, 4, 5, 7, 3857, 1380 ;t19a, t28a + psubsw m4, m2, m0 ;t18 + paddsw m0, m2 ;t19 + psubsw m5, m1, m3 ;t29 + paddsw m3, m1 ;t28 + ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a + mova [rsp+gprsize*2+16*21], m5 ;t18a + mova [rsp+gprsize*2+16*22], m0 ;t19 + mova [rsp+gprsize*2+16*31], m3 ;t28 + mova [rsp+gprsize*2+16*32], m4 ;t29a + mova m0, [rsp+gprsize*2+16*23] ;in5 + mova m1, [rsp+gprsize*2+16*24] ;in11 + mova m2, [rsp+gprsize*2+16*29] ;in21 + mova m3, [rsp+gprsize*2+16*30] ;in27 + ITX_MULSUB_2W 0, 3, 4, 5, 7, 995, 3973 ;t20a, t27a + ITX_MULSUB_2W 2, 1, 4, 5, 7, 3513, 2106 ;t21a, t26a + psubsw m4, m0, m2 ;t21 + paddsw m0, m2 ;t20 + psubsw m5, m3, m1 ;t26 + paddsw m3, m1 ;t27 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 3406, 2276 ;t21a, t26a + mova [rsp+gprsize*2+16*23], m0 ;t20 + mova [rsp+gprsize*2+16*24], m5 ;t21a + mova [rsp+gprsize*2+16*29], m4 ;t26a + mova [rsp+gprsize*2+16*30], m3 ;t27 + mova m0, [rsp+gprsize*2+16*25] ;in13 + mova m1, [rsp+gprsize*2+16*26] ;in3 + mova m2, [rsp+gprsize*2+16*27] ;in29 + mova m3, [rsp+gprsize*2+16*28] ;in19 + ITX_MULSUB_2W 0, 3, 4, 5, 7, 2440, 3290 ;t22a, t25a + ITX_MULSUB_2W 2, 1, 4, 5, 7, 4052, 601 ;t23a, t24a + +.main2: + psubsw m4, m2, m0 ;t22 + paddsw m0, m2 ;t23 + psubsw m5, m1, m3 ;t25 + paddsw m3, m1 ;t24 + ITX_MULSUB_2W 5, 4, 1, 2, 7, m2276, 3406 ;t22a, t25a + mova m2, [rsp+gprsize*2+16*24] ;t21a + psubsw m1, m5, m2 ;t21 + paddsw m5, m2 ;t22 + mova [rsp+gprsize*2+16*25], m5 ;t22 + mova m2, [rsp+gprsize*2+16*29] ;t26a + psubsw m5, m4, m2 ;t26 + paddsw m4, m2 ;t25 + mova [rsp+gprsize*2+16*28], m4 ;t25 + 
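+ ; t21/t26 take the negated 3784/1567 rotation of the 32-point odd half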
ITX_MULSUB_2W 5, 1, 2, 4, 7, m3784, 1567 ;t21a, t26a + mova [rsp+gprsize*2+16*24], m5 ;t21a + mova [rsp+gprsize*2+16*29], m1 ;t26a + + mova m1, [rsp+gprsize*2+16*23] ;t20 + mova m5, [rsp+gprsize*2+16*30] ;t27 + psubsw m2, m0, m1 ;t20a + paddsw m0, m1 ;t23a + psubsw m6, m3, m5 ;t27a + paddsw m3, m5 ;t24a + ITX_MULSUB_2W 6, 2, 1, 5, 7, m3784, 1567 ;t20, t27 + mova [rsp+gprsize*2+16*26], m0 ;t23a + mova [rsp+gprsize*2+16*27], m3 ;t24a + mova [rsp+gprsize*2+16*30], m2 ;t27 + + mova m0, [rsp+gprsize*2+16*20] ;t17a + mova m1, [rsp+gprsize*2+16*21] ;t18a + mova m2, [rsp+gprsize*2+16*32] ;t29a + mova m3, [rsp+gprsize*2+16*33] ;t30a + psubsw m4, m0, m1 ;t18 + paddsw m0, m1 ;t17 + psubsw m5, m3, m2 ;t29 + paddsw m3, m2 ;t30 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t18a, t29a + mova [rsp+gprsize*2+16*20], m0 ;t17 + mova [rsp+gprsize*2+16*21], m5 ;t18a + mova [rsp+gprsize*2+16*32], m4 ;t29a + mova [rsp+gprsize*2+16*33], m3 ;t30 + mova m0, [rsp+gprsize*2+16*19] ;t16 + mova m1, [rsp+gprsize*2+16*22] ;t19 + mova m2, [rsp+gprsize*2+16*31] ;t28 + mova m3, [rsp+gprsize*2+16*34] ;t31 + psubsw m4, m0, m1 ;t19a + paddsw m0, m1 ;t16a + psubsw m5, m3, m2 ;t28a + paddsw m3, m2 ;t31a + ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t19, t28 + mova m2, [rsp+gprsize*2+16*15] ;tmp12 + psubsw m1, m5, m6 ;t20a + paddsw m5, m6 ;t19a + psubsw m6, m2, m5 ;out19 + paddsw m2, m5 ;out12 + mova m5, [rsp+gprsize*2+16*30] ;t27 + mova [rsp+gprsize*2+16*22], m6 ;out19 + mova [rsp+gprsize*2+16*15], m2 ;out12 + psubsw m6, m4, m5 ;t27a + paddsw m4, m5 ;t28a + ITX_MULSUB_2W 6, 1, 2, 5, 7, 2896, 2896 ;t20, t27 + mova m2, [rsp+gprsize*2+16*6 ] ;tmp3 + psubsw m5, m2, m4 ;out28 + paddsw m2, m4 ;out3 + mova m4, [rsp+gprsize*2+16*14] ;tmp11 + mova [rsp+gprsize*2+16*31], m5 ;out28 + mova [rsp+gprsize*2+16*6 ], m2 ;out3 + psubsw m5, m4, m6 ;out20 + paddsw m4, m6 ;out11 + mova m2, [rsp+gprsize*2+16*7 ] ;tmp4 + mova [rsp+gprsize*2+16*23], m5 ;out20 + mova [rsp+gprsize*2+16*14], m4 ;out11 + psubsw m5, m2, m1 ;out27 + paddsw m2, m1 ;out4 + mova m1, [rsp+gprsize*2+16*26] ;t23a + mova m4, [rsp+gprsize*2+16*27] ;t24a + mova [rsp+gprsize*2+16*30], m5 ;out27 + mova [rsp+gprsize*2+16*7 ], m2 ;out4 + psubsw m5, m0, m1 ;t23 + paddsw m0, m1 ;t16 + psubsw m2, m3, m4 ;t24 + paddsw m3, m4 ;t31 + ITX_MULSUB_2W 2, 5, 4, 6, 7, 2896, 2896 ;t23a, t24a + mova m6, [rsp+gprsize*2+16*18] ;tmp15 + psubsw m4, m6, m0 ;out16 + paddsw m6, m0 ;out15 + mova m0, [rsp+gprsize*2+16*3 ] ;tmp0 + mova m1, [rsp+gprsize*2+16*11] ;tmp8 + mova [rsp+gprsize*2+16*18], m6 ;out15 + mova [rsp+gprsize*2+16*19], m4 ;out16 + psubsw m6, m0, m3 ;out31 + paddsw m0, m3 ;out0 + psubsw m4, m1, m2 ;out23 + paddsw m1, m2 ;out8 + mova m3, [rsp+gprsize*2+16*10] ;tmp7 + mova [rsp+gprsize*2+16*34], m6 ;out31 + mova [rsp+gprsize*2+16*11], m1 ;out8 + mova [rsp+gprsize*2+16*26], m4 ;out23 + paddsw m6, m3, m5 ;out7 + psubsw m3, m5 ;out24 + mova m1, [rsp+gprsize*2+16*20] ;t17 + mova m5, [rsp+gprsize*2+16*25] ;t22 + mova m2, [rsp+gprsize*2+16*17] ;tmp14 + mova [rsp+gprsize*2+16*27], m3 ;out24 + psubsw m4, m1, m5 ;t22a + paddsw m1, m5 ;t17a + psubsw m3, m2, m1 ;out17 + paddsw m2, m1 ;out14 + mova m5, [rsp+gprsize*2+16*28] ;t25 + mova m1, [rsp+gprsize*2+16*33] ;t30 + mova [rsp+gprsize*2+16*17], m2 ;out14 + mova [rsp+gprsize*2+16*20], m3 ;out17 + psubsw m2, m1, m5 ;t25a + paddsw m1, m5 ;t30a + ITX_MULSUB_2W 2, 4, 3, 5, 7, 2896, 2896 ;t22, t25 + mova m5, [rsp+gprsize*2+16*4 ] ;tmp1 + psubsw m3, m5, m1 ;out30 + paddsw m5, m1 ;out1 + mova m1, [rsp+gprsize*2+16*12] ;tmp9 + mova [rsp+gprsize*2+16*33], m3 ;out30 + mova 
[rsp+gprsize*2+16*4 ], m5 ;out1 + psubsw m3, m1, m2 ;out22 + paddsw m1, m2 ;out9 + mova m5, [rsp+gprsize*2+16*9 ] ;tmp6 + mova [rsp+gprsize*2+16*25], m3 ;out22 + mova [rsp+gprsize*2+16*12], m1 ;out9 + psubsw m3, m5, m4 ;out25 + paddsw m5, m4 ;out6 + mova m4, [rsp+gprsize*2+16*21] ;t18a + mova m1, [rsp+gprsize*2+16*24] ;t21a + mova m2, [rsp+gprsize*2+16*16] ;tmp13 + mova [rsp+gprsize*2+16*28], m3 ;out25 + mova [rsp+gprsize*2+16*9 ], m5 ;out6 + paddsw m3, m4, m1 ;t18 + psubsw m4, m1 ;t21 + psubsw m5, m2, m3 ;out18 + paddsw m2, m3 ;out13 + mova m1, [rsp+gprsize*2+16*29] ;t26a + mova m3, [rsp+gprsize*2+16*32] ;t29a + mova [rsp+gprsize*2+16*21], m5 ;out18 + mova [rsp+gprsize*2+16*16], m2 ;out13 + psubsw m5, m3, m1 ;t26 + paddsw m3, m1 ;t29 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 2896, 2896 ;t21a, t26a + mova m2, [rsp+gprsize*2+16*5 ] ;tmp2 + psubsw m1, m2, m3 ;out29 + paddsw m2, m3 ;out2 + mova m3, [rsp+gprsize*2+16*13] ;tmp10 + mova [rsp+gprsize*2+16*32], m1 ;out29 + psubsw m7, m3, m5 ;out21 + paddsw m3, m5 ;out10 + mova m5, [rsp+gprsize*2+16*8 ] ;tmp5 + mova [rsp+gprsize*2+16*24], m7 ;out21 + mova [rsp+gprsize*2+16*13], m3 ;out10 + psubsw m1, m5, m4 ;out26 + paddsw m5, m4 ;out5 + mova m7, m6 ;out7 + mova m3, [rsp+gprsize*2+16*6 ] ;out3 + mova m4, [rsp+gprsize*2+16*7 ] ;out4 + mova [rsp+gprsize*2+16*29], m1 ;out26 + mova m6, [rsp+gprsize*2+16*9 ] ;out6 + mova m1, [rsp+gprsize*2+16*4 ] ;out1 + ret + + +cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + call m(idct_32x8_internal_8bpc) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] + mov [coeffq], eobd + mov r3d, 8 + lea tx2q, [o(.end)] + +.body: + pmulhrsw m0, m2 + movd m2, [o(pw_2048)] ;intentionally rip-relative + pmulhrsw m0, m1 + pmulhrsw m0, m2 + pshuflw m0, m0, q0000 + punpcklwd m0, m0 + pxor m5, m5 + +.loop: + mova m1, [dstq+16*0] + mova m3, [dstq+16*1] + punpckhbw m2, m1, m5 + punpcklbw m1, m5 + punpckhbw m4, m3, m5 + punpcklbw m3, m5 + paddw m2, m0 + paddw m1, m0 + paddw m4, m0 + paddw m3, m0 + packuswb m1, m2 + packuswb m3, m4 + mova [dstq+16*0], m1 + mova [dstq+16*1], m3 + add dstq, strideq + dec r3d + jg .loop + jmp tx2q + +.end: + RET + + +cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq+16*0, 64 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + LOAD_8ROWS coeffq+16*2, 64 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS coeffq+16*1, 32 + mova [rsp+gprsize+16*19], m0 ;in1 + mova [rsp+gprsize+16*26], m1 ;in3 + mova [rsp+gprsize+16*23], m2 ;in5 + mova [rsp+gprsize+16*22], m3 ;in7 + mova [rsp+gprsize+16*21], m4 ;in9 + mova [rsp+gprsize+16*24], m5 ;in11 + mova [rsp+gprsize+16*25], m6 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + + cmp eobd, 106 + jg .full + call m(idct_8x32_internal_8bpc).main_fast + jmp .pass2 + +.full: + LOAD_8ROWS coeffq+16*17, 32 + mova [rsp+gprsize+16*33], m0 ;in17 + mova [rsp+gprsize+16*28], m1 ;in19 + mova [rsp+gprsize+16*29], m2 ;in21 + mova [rsp+gprsize+16*32], m3 ;in23 + mova [rsp+gprsize+16*31], m4 ;in25 + mova [rsp+gprsize+16*30], m5 ;in27 + mova [rsp+gprsize+16*27], m6 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + call m(idct_8x32_internal_8bpc).main + +.pass2: + mova [rsp+gprsize+16*0 ], m7 + lea tx2q, [o(.end)] + jmp m(idct_8x32_internal_8bpc).end1 + +.end: + mova m7, [o(pw_8192)] + lea tx2q, [o(.end1)] + jmp 
m(idct_8x8_internal_8bpc).pass1_end1
+
+.end1:
+    lea                   r3, [dstq+8]
+    lea                 tx2q, [o(.end2)]
+    jmp  m(idct_8x8_internal_8bpc).pass2_main
+
+.end2:
+    LOAD_8ROWS rsp+gprsize+16*11, 16
+    mova [rsp+gprsize+16*0 ], m7
+    mova                  m7, [o(pw_8192)]
+    lea                 tx2q, [o(.end3)]
+    jmp  m(idct_8x8_internal_8bpc).pass1_end1
+
+.end3:
+    mov                 dstq, r3
+    add                   r3, 8
+    lea                 tx2q, [o(.end4)]
+    jmp  m(idct_8x8_internal_8bpc).pass2_main
+
+.end4:
+    LOAD_8ROWS rsp+gprsize+16*19, 16
+    mova [rsp+gprsize+16*0 ], m7
+    mova                  m7, [o(pw_8192)]
+    lea                 tx2q, [o(.end5)]
+    jmp  m(idct_8x8_internal_8bpc).pass1_end1
+
+.end5:
+    mov                 dstq, r3
+    add                   r3, 8
+    lea                 tx2q, [o(.end6)]
+    jmp  m(idct_8x8_internal_8bpc).pass2_main
+
+.end6:
+    LOAD_8ROWS rsp+gprsize+16*27, 16
+    mova [rsp+gprsize+16*0 ], m7
+    mova                  m7, [o(pw_8192)]
+    lea                 tx2q, [o(.end7)]
+    jmp  m(idct_8x8_internal_8bpc).pass1_end1
+
+.end7:
+    mov                 dstq, r3
+    lea                 tx2q, [o(.end8)]
+    jmp  m(idct_8x8_internal_8bpc).pass2_main
+
+.end8:
+    ret
+
+
+cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
+    mov                  r5d, 4
+    mov                 tx2d, 2
+    cmp                 eobd, 107
+    cmovns              tx2d, r5d
+    mov                  r3d, tx2d
+%if ARCH_X86_32
+    LEA                   r5, $$
+%endif
+    lea                 tx2q, [o(m(idct_32x8_internal_8bpc).end8)]
+.loop:
+    LOAD_8ROWS   coeffq+16*0, 64
+    paddsw                m6, [o(pw_5)]
+    mova          [rsp+16*1], m6
+    mova                  m6, [o(pw_5)]
+    REPX      {paddsw x, m6}, m0, m1, m2, m3, m4, m5, m7
+    call m(idct_8x8_internal_8bpc).pass1_end3
+    REPX       {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
+    mova          [rsp+16*2], m5
+    mova          [rsp+16*1], m6
+    mova          [rsp+16*0], m7
+    call m(idct_8x8_internal_8bpc).end3
+    lea                 dstq, [dstq+strideq*2]
+    pxor                  m7, m7
+    REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+    add               coeffq, 16
+    dec                  r3d
+    jg .loop
+    RET
+
+cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
+    mov                  r5d, 4
+    mov                 tx2d, 2
+    cmp                 eobd, 107
+    cmovns              tx2d, r5d
+    mov                  r3d, tx2d
+%if ARCH_X86_32
+    LEA                   r5, $$
+%endif
+
+.loop:
+    LOAD_8ROWS   coeffq+16*0, 16
+    pmulhrsw              m6, [o(pw_4096)]
+    mova          [rsp+16*1], m6
+    mova                  m6, [o(pw_4096)]
+    REPX    {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7
+    lea                 tx2q, [o(m(idct_32x8_internal_8bpc).end8)]
+    call m(idct_8x8_internal_8bpc).pass1_end3
+
+    mov           [rsp+16*3], dstq
+    mova          [rsp+16*2], m5
+    mova          [rsp+16*1], m6
+    mova          [rsp+16*0], m7
+    lea                 tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
+    call m(idct_8x8_internal_8bpc).end3
+
+    add               coeffq, 16*8
+    mov                 dstq, [rsp+16*3]
+    lea                 dstq, [dstq+8]
+    dec                  r3d
+    jg .loop
+    RET
+
+
+cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+    LEA                   r5, $$
+%endif
+    test                eobd, eobd
+    jz .dconly
+    call m(idct_16x32_internal_8bpc)
+.end:
+    RET
+
+.dconly:
+    movd                  m1, [o(pw_2896x8)]
+    pmulhrsw              m0, m1, [coeffq]
+    movd                  m2, [o(pw_16384)]
+    mov             [coeffq], eobd
+    pmulhrsw              m0, m1
+    mov                  r2d, 16
+    lea                 tx2q, [o(.end)]
+    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+
+
+cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+    LOAD_8ROWS   coeffq+16*1, 128, 1
+    call m(idct_8x8_internal_8bpc).main
+    SAVE_7ROWS rsp+gprsize+16*3, 16
+    LOAD_8ROWS   coeffq+16*5, 128, 1
+    call m(idct_16x8_internal_8bpc).main
+    lea                 tx2q, [o(.pass1_end)]
+    jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end:
+    SAVE_8ROWS  coeffq+16*33, 64              ;in8~in15
+    LOAD_8ROWS rsp+gprsize+16*3, 16
+    mova [rsp+gprsize+16*0], m7
+    lea                 tx2q, [o(.pass1_end1)]
+    jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end1:
+    mova     [coeffq+16*1 ], m0               ;in8
+    mova     [coeffq+16*5 ], m4               ;in12
+    mova [rsp+gprsize+16*13], m2              ;in10
+    mova [rsp+gprsize+16*14], m6              ;in14
+    mova
[rsp+gprsize+16*21], m1 ;in9 + mova [rsp+gprsize+16*24], m3 ;in11 + mova [rsp+gprsize+16*25], m5 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + LOAD_8ROWS coeffq+16*0, 128, 1 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+16*4, 128, 1 + call m(idct_16x8_internal_8bpc).main + lea tx2q, [o(.pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end2: + SAVE_8ROWS coeffq+16*32, 64 ;in0~in7 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end3: + mova [rsp+gprsize+16*11], m2 ;in2 + mova [rsp+gprsize+16*12], m6 ;in6 + mova [rsp+gprsize+16*19], m1 ;in1 + mova [rsp+gprsize+16*26], m3 ;in3 + mova [rsp+gprsize+16*23], m5 ;in5 + mova [rsp+gprsize+16*22], m7 ;in7 + + cmp eobd, 150 + jg .full + + mova m1, m4 ;in4 + mova m2, [coeffq+16*1 ] ;in8 + mova m3, [coeffq+16*5 ] ;in12 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + mova m0, [rsp+gprsize+16*11] ;in2 + mova m1, [rsp+gprsize+16*12] ;in6 + mova m2, [rsp+gprsize+16*13] ;in10 + mova m3, [rsp+gprsize+16*14] ;in14 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + call m(idct_8x32_internal_8bpc).main_fast + jmp .pass2 + +.full: + mova [coeffq+16*0 ], m0 ;in0 + mova [coeffq+16*4 ], m4 ;in4 + + LOAD_8ROWS coeffq+16*2, 128, 1 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+16*6, 128, 1 + call m(idct_16x8_internal_8bpc).main + lea tx2q, [o(.pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end4: + SAVE_8ROWS coeffq+16*34, 64 ;in16~in23 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end5)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end5: + mova [coeffq+16*2 ], m0 ;in16 + mova [coeffq+16*6 ], m4 ;in20 + mova [rsp+gprsize+16*15], m2 ;in18 + mova [rsp+gprsize+16*16], m6 ;in22 + mova [rsp+gprsize+16*33], m1 ;in17 + mova [rsp+gprsize+16*28], m3 ;in19 + mova [rsp+gprsize+16*29], m5 ;in21 + mova [rsp+gprsize+16*32], m7 ;in23 + + LOAD_8ROWS coeffq+16*3, 128, 1 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+16*7, 128, 1 + call m(idct_16x8_internal_8bpc).main + lea tx2q, [o(.pass1_end6)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end6: + SAVE_8ROWS coeffq+16*35, 64 ;in24~in31 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end7)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end7: + mova [rsp+gprsize+16*17], m2 ;in26 + mova [rsp+gprsize+16*18], m6 ;in30 + mova [rsp+gprsize+16*31], m1 ;in25 + mova [rsp+gprsize+16*30], m3 ;in27 + mova [rsp+gprsize+16*27], m5 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + + mova m6, m0 ;in24 + mova m7, m4 ;in28 + mova m0, [coeffq+16*0 ] ;in0 + mova m1, [coeffq+16*4 ] ;in4 + mova m2, [coeffq+16*1 ] ;in8 + mova m3, [coeffq+16*5 ] ;in12 + mova m4, [coeffq+16*2 ] ;in16 + mova m5, [coeffq+16*6 ] ;in20 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3 , 16 + LOAD_8ROWS rsp+gprsize+16*11, 16 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + call m(idct_8x32_internal_8bpc).main + +.pass2: + mov [rsp+gprsize*1+16*35], eobd + lea r3, [dstq+8] + mov [rsp+gprsize*2+16*35], r3 + lea r3, [o(.end)] + jmp m(idct_8x32_internal_8bpc).end + +.end: + mov dstq, 
[rsp+gprsize*2+16*35]
+    mov                 eobd, [rsp+gprsize*1+16*35]
+    add               coeffq, 16*32
+
+    mova                  m0, [coeffq+16*4 ]  ;in1
+    mova                  m1, [coeffq+16*12]  ;in3
+    mova                  m2, [coeffq+16*20]  ;in5
+    mova                  m3, [coeffq+16*28]  ;in7
+    mova                  m4, [coeffq+16*5 ]  ;in9
+    mova                  m5, [coeffq+16*13]  ;in11
+    mova                  m6, [coeffq+16*21]  ;in13
+    mova                  m7, [coeffq+16*29]  ;in15
+
+    mova [rsp+gprsize+16*19], m0              ;in1
+    mova [rsp+gprsize+16*26], m1              ;in3
+    mova [rsp+gprsize+16*23], m2              ;in5
+    mova [rsp+gprsize+16*22], m3              ;in7
+    mova [rsp+gprsize+16*21], m4              ;in9
+    mova [rsp+gprsize+16*24], m5              ;in11
+    mova [rsp+gprsize+16*25], m6              ;in13
+    mova [rsp+gprsize+16*20], m7              ;in15
+
+    mova                  m0, [coeffq+16*0 ]  ;in0
+    mova                  m1, [coeffq+16*16]  ;in4
+    mova                  m2, [coeffq+16*1 ]  ;in8
+    mova                  m3, [coeffq+16*17]  ;in12
+
+    cmp                 eobd, 150
+    jg .full1
+
+    pxor                  m4, m4
+    REPX        {mova x, m4}, m5, m6, m7
+    call m(idct_8x8_internal_8bpc).main
+    SAVE_7ROWS rsp+gprsize+16*3, 16
+
+    mova                  m0, [coeffq+16*8 ]  ;in2
+    mova                  m1, [coeffq+16*24]  ;in6
+    mova                  m2, [coeffq+16*9 ]  ;in10
+    mova                  m3, [coeffq+16*25]  ;in14
+    pxor                  m4, m4
+    REPX        {mova x, m4}, m5, m6, m7
+    call m(idct_16x8_internal_8bpc).main
+    mova                  m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS rsp+gprsize+16*11, 16
+
+    call m(idct_8x32_internal_8bpc).main_fast
+    jmp m(idct_8x32_internal_8bpc).pass2
+
+.full1:
+    mova                  m4, [coeffq+16*2 ]  ;in16
+    mova                  m5, [coeffq+16*18]  ;in20
+    mova                  m6, [coeffq+16*3 ]  ;in24
+    mova                  m7, [coeffq+16*19]  ;in28
+    call m(idct_8x8_internal_8bpc).main
+    SAVE_7ROWS rsp+gprsize+16*3, 16
+
+    mova                  m0, [coeffq+16*8 ]  ;in2
+    mova                  m1, [coeffq+16*24]  ;in6
+    mova                  m2, [coeffq+16*9 ]  ;in10
+    mova                  m3, [coeffq+16*25]  ;in14
+    mova                  m4, [coeffq+16*10]  ;in18
+    mova                  m5, [coeffq+16*26]  ;in22
+    mova                  m6, [coeffq+16*11]  ;in26
+    mova                  m7, [coeffq+16*27]  ;in30
+    call m(idct_16x8_internal_8bpc).main
+    mova                  m7, [rsp+gprsize+16*0]
+    SAVE_8ROWS rsp+gprsize+16*11, 16
+
+    mova                  m0, [coeffq+16*6 ]  ;in17
+    mova                  m1, [coeffq+16*14]  ;in19
+    mova                  m2, [coeffq+16*22]  ;in21
+    mova                  m3, [coeffq+16*30]  ;in23
+    mova                  m4, [coeffq+16*7 ]  ;in25
+    mova                  m5, [coeffq+16*15]  ;in27
+    mova                  m6, [coeffq+16*23]  ;in29
+    mova                  m7, [coeffq+16*31]  ;in31
+
+    mova [rsp+gprsize+16*33], m0              ;in17
+    mova [rsp+gprsize+16*28], m1              ;in19
+    mova [rsp+gprsize+16*29], m2              ;in21
+    mova [rsp+gprsize+16*32], m3              ;in23
+    mova [rsp+gprsize+16*31], m4              ;in25
+    mova [rsp+gprsize+16*30], m5              ;in27
+    mova [rsp+gprsize+16*27], m6              ;in29
+    mova [rsp+gprsize+16*34], m7              ;in31
+
+    call m(idct_8x32_internal_8bpc).main
+    jmp m(idct_8x32_internal_8bpc).pass2
+
+
+cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+    LEA                   r5, $$
+%endif
+    test                eobd, eobd
+    jz .dconly
+
+    call m(idct_32x16_internal_8bpc)
+    call m(idct_8x16_internal_8bpc).pass2
+
+    add               coeffq, 16*16
+    lea                 dstq, [r3+8]
+    LOAD_8ROWS     rsp+16*11, 16
+    mova          [rsp+16*0], m7
+    lea                 tx2q, [o(m(idct_32x16_internal_8bpc).end)]
+    call m(idct_8x8_internal_8bpc).pass1_end
+    call m(idct_8x16_internal_8bpc).pass2
+
+    add               coeffq, 16*16
+    lea                 dstq, [r3+8]
+    LOAD_8ROWS     rsp+16*19, 16
+    mova          [rsp+16*0], m7
+    lea                 tx2q, [o(m(idct_32x16_internal_8bpc).end)]
+    call m(idct_8x8_internal_8bpc).pass1_end
+    call m(idct_8x16_internal_8bpc).pass2
+
+    add               coeffq, 16*16
+    lea                 dstq, [r3+8]
+    LOAD_8ROWS     rsp+16*27, 16
+    mova          [rsp+16*0], m7
+    lea                 tx2q, [o(m(idct_32x16_internal_8bpc).end)]
+    call m(idct_8x8_internal_8bpc).pass1_end
+    call m(idct_8x16_internal_8bpc).pass2
+    RET
+
+.dconly:
+    movd                  m1, [o(pw_2896x8)]
+    pmulhrsw              m0, m1, [coeffq]
+    movd                  m2, [o(pw_16384)]
+    mov             [coeffq], eobd
+    pmulhrsw              m0, m1
+    mov                  r3d, 16
+    lea                 tx2q,
[o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body + + +cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + add coeffq, 16 + lea r3, [o(.pass1_end1)] +.pass1: + LOAD_8ROWS coeffq+16*0, 128, 1 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + LOAD_8ROWS coeffq+16*4, 128, 1 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS coeffq+16*2, 64, 1 + mova [rsp+gprsize+16*19], m0 ;in1 + mova [rsp+gprsize+16*26], m1 ;in3 + mova [rsp+gprsize+16*23], m2 ;in5 + mova [rsp+gprsize+16*22], m3 ;in7 + mova [rsp+gprsize+16*21], m4 ;in9 + mova [rsp+gprsize+16*24], m5 ;in11 + mova [rsp+gprsize+16*25], m6 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + + LOAD_8ROWS coeffq+16*34, 64, 1 + mova [rsp+gprsize+16*33], m0 ;in17 + mova [rsp+gprsize+16*28], m1 ;in19 + mova [rsp+gprsize+16*29], m2 ;in21 + mova [rsp+gprsize+16*32], m3 ;in23 + mova [rsp+gprsize+16*31], m4 ;in25 + mova [rsp+gprsize+16*30], m5 ;in27 + mova [rsp+gprsize+16*27], m6 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + call m(idct_8x32_internal_8bpc).main + +.pass1_end: + mova [rsp+gprsize+16*0 ], m7 + mov tx2q, r3 + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end1: + SAVE_8ROWS coeffq+16*0, 32 + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0 ], m7 + lea tx2q, [o(.pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end2: + SAVE_8ROWS coeffq+16*16, 32 + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0 ], m7 + lea tx2q, [o(.pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end3: + SAVE_8ROWS coeffq+16*32, 32 + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0 ], m7 + lea tx2q, [o(.pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end4: + SAVE_8ROWS coeffq+16*48, 32 + + sub coeffq, 16 + lea r3, [o(.end)] + jmp .pass1 + +.end: + ret + + +cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 + mov r4d, eobd + cmp eobd, 43 ;if (eob > 43) + sbb r3d, r3d ; iteration_count++ + cmp r4d, 150 ;if (eob > 150) + sbb r3d, 0 ; iteration_count++ + cmp r4d, 278 ;if (eob > 278) + sbb r3d, -4 ; iteration_count++ + +%if ARCH_X86_32 + LEA r5, $$ +%endif + lea r4, [dstq+8] + mov [rsp+16*3], r4 + mov [rsp+gprsize+16*3], r3d + mov [rsp+gprsize*2+16*3], coeffq + +.loop: + LOAD_8ROWS coeffq, 64, 1 + mova [rsp+16*1], m6 + pxor m6, m6 + REPX {mova [coeffq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 + lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] + call m(idct_8x8_internal_8bpc).pass1_end3 + mova [rsp+16*0], m2 + mova [rsp+16*1], m3 + mova [rsp+16*2], m4 + mova m3, [o(pw_1697x16)] + mova m4, [o(pw_16384)] + REPX {IDTX16 x, 2, 3, 4}, 5, 6, 7, 0, 1 + mova m2, [o(pw_8192)] + REPX {pmulhrsw x, m2}, m5, m6, m7, m0, m1 + mova m2, [rsp+16*0] + mova [rsp+16*0], m7 + IDTX16 2, 7, 3, 4 + mova m7, [rsp+16*2] + mova [rsp+16*2], m5 + IDTX16 7, 5, 3, 4 + mova m5, [rsp+16*1] + mova [rsp+16*1], m6 + pmulhrsw m3, m5 + pmulhrsw m3, m4 + psrlw m4, 1 ; pw_8192 + paddsw m3, m5 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + pmulhrsw m4, m7 + call m(idct_8x8_internal_8bpc).end3 + lea dstq, [dstq+strideq*2] + add coeffq, 16 + dec r3d + jg .loop + mov coeffq, [rsp+gprsize*2+16*3] + add coeffq, 64*8 + mov r3d, [rsp+gprsize+16*3] + xor dstq, dstq + mov [rsp+gprsize+16*3], dstq + mov dstq, [rsp+16*3] + test r3d, r3d + jnz .loop + RET + + +cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 + mov r4d, 12 ;0100b + mov r5d, 
136 ;1000 1000b + cmp eobd, 44 ;if (eob > 43) + cmovns r4d, r5d ; iteration_count+2 + cmp eobd, 151 ;if (eob > 150) + mov r3d, 34952 ;1000 1000 1000 1000b + cmovs r3d, r4d ; iteration_count += 4 + +%if ARCH_X86_32 + LEA r5, $$ +%endif + lea r4, [dstq+8] + mov [rsp+16*3], r4 + +.loop: + LOAD_8ROWS coeffq, 32, 1 + REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7 + mova [rsp+16*1], m6 + lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] + call m(idct_8x8_internal_8bpc).pass1_end3 + mova [rsp+16*1], m5 + mova [rsp+16*2], m6 + mova m6, [o(pw_1697x16)] + REPX {IDTX16 x, 5, 6}, 7, 0, 1, 2, 3, 4 + pmulhrsw m7, [o(pw_2048)] + mova m5, [rsp+16*1] + mova [rsp+16*0], m7 + IDTX16 5, 7, 6 + mova m7, [rsp+16*2] + IDTX16 7, 6, 6 + mova m6, [o(pw_2048)] + REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7 + mova [rsp+16*2], m5 + mova [rsp+16*1], m7 + call m(idct_8x8_internal_8bpc).end3 + lea dstq, [dstq+strideq*2] + pxor m7, m7 + REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + +.loop_end: + add coeffq, 16 + shr r3d, 2 + jz .ret + test r3d, 2 + jnz .loop + mov r4d, r3d + and r4d, 1 + lea coeffq, [coeffq+r4*8+32*7] + mov dstq, [rsp+16*3] + lea r4, [dstq+8] + mov [rsp+16*3], r4 + jmp .loop + +.ret: + RET + + +cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + + call m(idct_32x32_internal_8bpc) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] + mov [coeffq], eobd + mov r3d, 32 + lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body + + +cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mov r4d, 2 + sub eobd, 136 + mov [rsp+gprsize*1+16*35], eobd + mov r3d, 4 + cmovs r3d, r4d + +%if ARCH_X86_32 + LEA r5, $$ +%endif + + mov [rsp+gprsize*2+16*35], coeffq + +.pass1_loop: + LOAD_8ROWS coeffq+64*1, 64*2 + mova [rsp+gprsize+16*19], m0 ;in1 + mova [rsp+gprsize+16*26], m1 ;in3 + mova [rsp+gprsize+16*23], m2 ;in5 + mova [rsp+gprsize+16*22], m3 ;in7 + mova [rsp+gprsize+16*21], m4 ;in9 + mova [rsp+gprsize+16*24], m5 ;in11 + mova [rsp+gprsize+16*25], m6 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + + mov tx2d, [rsp+gprsize*1+16*35] + test tx2d, tx2d + jl .fast + +.full: + LOAD_8ROWS coeffq+64*0, 64*4 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+64*2, 64*4 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS coeffq+64*17, 64*2 + mova [rsp+gprsize+16*33], m0 ;in17 + mova [rsp+gprsize+16*28], m1 ;in19 + mova [rsp+gprsize+16*29], m2 ;in21 + mova [rsp+gprsize+16*32], m3 ;in23 + mova [rsp+gprsize+16*31], m4 ;in25 + mova [rsp+gprsize+16*30], m5 ;in27 + mova [rsp+gprsize+16*27], m6 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + + call m(idct_8x32_internal_8bpc).main + jmp .pass1_end + +.fast: + mova m0, [coeffq+256*0] + mova m1, [coeffq+256*1] + mova m2, [coeffq+256*2] + mova m3, [coeffq+256*3] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + + SAVE_7ROWS rsp+gprsize+16*3, 16 + mova m0, [coeffq+128*1] + mova m1, [coeffq+128*3] + mova m2, [coeffq+128*5] + mova m3, [coeffq+128*7] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + call m(idct_8x32_internal_8bpc).main_fast + +.pass1_end: + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, 
[o(.pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+64*0, 64 + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end2: + SAVE_8ROWS coeffq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end3: + SAVE_8ROWS coeffq+64*16, 64 + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end4: + SAVE_8ROWS coeffq+64*24, 64 + + add coeffq, 16 + dec r3d + jg .pass1_loop + + +.pass2: + mov coeffq, [rsp+gprsize*2+16*35] + mov r3d, 4 + lea tx2q, [o(.pass2_end)] + +.pass2_loop: + mov [rsp+gprsize*3+16*35], r3d + lea r3, [dstq+8] + mov [rsp+gprsize*2+16*35], r3 + + mova m0, [coeffq+16*4 ] + mova m1, [coeffq+16*12] + mova m2, [coeffq+16*20] + mova m3, [coeffq+16*28] + mova m4, [coeffq+16*5 ] + mova m5, [coeffq+16*13] + mova m6, [coeffq+16*21] + mova m7, [coeffq+16*29] + mova [rsp+gprsize+16*19], m0 ;in1 + mova [rsp+gprsize+16*26], m1 ;in3 + mova [rsp+gprsize+16*23], m2 ;in5 + mova [rsp+gprsize+16*22], m3 ;in7 + mova [rsp+gprsize+16*21], m4 ;in9 + mova [rsp+gprsize+16*24], m5 ;in11 + mova [rsp+gprsize+16*25], m6 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + + mov eobd, [rsp+gprsize*1+16*35] + test eobd, eobd + jl .fast1 + +.full1: + mova m0, [coeffq+16*0 ] + mova m1, [coeffq+16*16] + mova m2, [coeffq+16*1 ] + mova m3, [coeffq+16*17] + mova m4, [coeffq+16*2 ] + mova m5, [coeffq+16*18] + mova m6, [coeffq+16*3 ] + mova m7, [coeffq+16*19] + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + mova m0, [coeffq+16*8 ] + mova m1, [coeffq+16*24] + mova m2, [coeffq+16*9 ] + mova m3, [coeffq+16*25] + mova m4, [coeffq+16*10] + mova m5, [coeffq+16*26] + mova m6, [coeffq+16*11] + mova m7, [coeffq+16*27] + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + mova m0, [coeffq+16*6 ] + mova m1, [coeffq+16*14] + mova m2, [coeffq+16*22] + mova m3, [coeffq+16*30] + mova m4, [coeffq+16*7 ] + mova m5, [coeffq+16*15] + mova m6, [coeffq+16*23] + mova m7, [coeffq+16*31] + mova [rsp+gprsize+16*33], m0 ;in17 + mova [rsp+gprsize+16*28], m1 ;in19 + mova [rsp+gprsize+16*29], m2 ;in21 + mova [rsp+gprsize+16*32], m3 ;in23 + mova [rsp+gprsize+16*31], m4 ;in25 + mova [rsp+gprsize+16*30], m5 ;in27 + mova [rsp+gprsize+16*27], m6 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + + call m(idct_8x32_internal_8bpc).main + jmp tx2q + +.fast1: + mova m0, [coeffq+16*0 ] + mova m1, [coeffq+16*16] + mova m2, [coeffq+16*1 ] + mova m3, [coeffq+16*17] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + mova m0, [coeffq+16*8 ] + mova m1, [coeffq+16*24] + mova m2, [coeffq+16*9 ] + mova m3, [coeffq+16*25] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + call m(idct_8x32_internal_8bpc).main_fast + jmp tx2q + +.pass2_end: + lea r3, [o(.pass2_end1)] + jmp m(idct_8x32_internal_8bpc).end + +.pass2_end1: + lea tx2q, [o(.pass2_end)] + add coeffq, 16*32 + mov dstq, [rsp+gprsize*2+16*35] + mov r3d, [rsp+gprsize*3+16*35] + dec r3d + jg .pass2_loop + + ret + + +cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 8, 16*5, 
dst, stride, coeff, eob, tx2 + mov r4d, 2 + cmp eobd, 136 + mov r3d, 4 + cmovs r3d, r4d + +%if ARCH_X86_32 + LEA r5, $$ +%endif + + lea r4, [dstq+8] + mov [rsp+gprsize*0+16*3], r4 + mov [rsp+gprsize*1+16*3], r3d + mov [rsp+gprsize*2+16*3], r3d + mov [rsp+gprsize*3+16*3], coeffq + +.loop: + LOAD_8ROWS coeffq, 64 + mova [rsp+16*1], m6 + lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] + call m(idct_8x8_internal_8bpc).pass1_end3 + pmulhrsw m7, [o(pw_8192)] + mova [rsp+16*0], m7 + mova m7, [o(pw_8192)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + mova [rsp+16*1], m6 + mova [rsp+16*2], m5 + call m(idct_8x8_internal_8bpc).end3 + lea dstq, [dstq+strideq*2] + + pxor m7, m7 + REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + + add coeffq, 16 + dec r3d + jg .loop + + mov r4d, [rsp+gprsize*2+16*3] + dec r4d + jle .ret + + mov dstq, [rsp+gprsize*0+16*3] + mov coeffq, [rsp+gprsize*3+16*3] + mov [rsp+gprsize*2+16*3], r4 + lea r3, [dstq+8] + add coeffq, 64*8 + mov [rsp+gprsize*0+16*3], r3 + mov r3d, [rsp+gprsize*1+16*3] + mov [rsp+gprsize*3+16*3], coeffq + jmp .loop + +.ret: + RET + + +cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + call m(idct_16x64_internal_8bpc) +.end: + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] + mov [coeffq], eobd + mov r2d, 32 + lea tx2q, [o(.end)] + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly + + +cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mov r4d, 2 + sub eobd, 151 + mov [rsp+gprsize*1+16*67], eobd + mov r3d, 4 + cmovs r3d, r4d + +%if ARCH_X86_32 + LEA r5, $$ +%endif + + mov [rsp+gprsize*2+16*67], coeffq + +.pass1_loop: + LOAD_8ROWS coeffq+64*0, 64*2 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+64*1, 64*2 + call m(idct_16x8_internal_8bpc).main + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end: + SAVE_8ROWS coeffq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+64*0, 64 + + add coeffq, 16 + dec r3d + jg .pass1_loop + + mov coeffq, [rsp+gprsize*2+16*67] + mov r3d, 2 + lea r4, [dstq+8] + mov [rsp+gprsize*2+16*67], r4 + lea r4, [o(.end1)] + +.pass2_loop: + mov [rsp+gprsize*3+16*67], r3d + mov eobd, [rsp+gprsize*1+16*67] + + mova m0, [coeffq+16*4 ] ;in1 + mova m1, [coeffq+16*12] ;in3 + mova m2, [coeffq+16*20] ;in5 + mova m3, [coeffq+16*28] ;in7 + mova m4, [coeffq+16*5 ] ;in9 + mova m5, [coeffq+16*13] ;in11 + mova m6, [coeffq+16*21] ;in13 + mova m7, [coeffq+16*29] ;in15 + mova [rsp+gprsize+16*35], m0 ;in1 + mova [rsp+gprsize+16*49], m1 ;in3 + mova [rsp+gprsize+16*43], m2 ;in5 + mova [rsp+gprsize+16*41], m3 ;in7 + mova [rsp+gprsize+16*39], m4 ;in9 + mova [rsp+gprsize+16*45], m5 ;in11 + mova [rsp+gprsize+16*47], m6 ;in13 + mova [rsp+gprsize+16*37], m7 ;in15 + + pxor m4, m4 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + + test eobd, eobd + jl .fast + +.full: + mova m2, [coeffq+16*2] + mova m3, [coeffq+16*3] + + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + pxor m4, m4 + mova m0, [coeffq+16*16] + mova m1, [coeffq+16*17] + mova m2, [coeffq+16*18] + mova m3, [coeffq+16*19] + + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, 
[rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + mova m0, [coeffq+16*8 ] + mova m1, [coeffq+16*24] + mova m2, [coeffq+16*9 ] + mova m3, [coeffq+16*25] + mova m4, [coeffq+16*10] + mova m5, [coeffq+16*26] + mova m6, [coeffq+16*11] + mova m7, [coeffq+16*27] + mova [rsp+gprsize+16*19], m0 + mova [rsp+gprsize+16*26], m1 + mova [rsp+gprsize+16*23], m2 + mova [rsp+gprsize+16*22], m3 + mova [rsp+gprsize+16*21], m4 + mova [rsp+gprsize+16*24], m5 + mova [rsp+gprsize+16*25], m6 + mova [rsp+gprsize+16*20], m7 + + call m(idct_8x32_internal_8bpc).main_fast + SAVE_8ROWS rsp+gprsize+16*3, 16 + + mova m0, [coeffq+16*6 ] ;in17 + mova m1, [coeffq+16*14] ;in19 + mova m2, [coeffq+16*22] ;in21 + mova m3, [coeffq+16*30] ;in23 + mova m4, [coeffq+16*7 ] ;in25 + mova m5, [coeffq+16*15] ;in27 + mova m6, [coeffq+16*23] ;in29 + mova m7, [coeffq+16*31] ;in31 + mova [rsp+gprsize+16*63], m0 ;in17 + mova [rsp+gprsize+16*53], m1 ;in19 + mova [rsp+gprsize+16*55], m2 ;in21 + mova [rsp+gprsize+16*61], m3 ;in23 + mova [rsp+gprsize+16*59], m4 ;in25 + mova [rsp+gprsize+16*57], m5 ;in27 + mova [rsp+gprsize+16*51], m6 ;in29 + mova [rsp+gprsize+16*65], m7 ;in31 + + call .main + jmp .end + +.fast: + REPX {mova x, m4}, m2, m3, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + pxor m4, m4 + mova m0, [coeffq+16*16] + mova m1, [coeffq+16*17] + + REPX {mova x, m4}, m2, m3, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + mova m0, [coeffq+16*8 ] + mova m1, [coeffq+16*24] + mova m2, [coeffq+16*9 ] + mova m3, [coeffq+16*25] + mova [rsp+gprsize+16*19], m0 ;in1 + mova [rsp+gprsize+16*26], m1 ;in3 + mova [rsp+gprsize+16*23], m2 ;in5 + mova [rsp+gprsize+16*22], m3 ;in7 + + call m(idct_8x32_internal_8bpc).main_veryfast + SAVE_8ROWS rsp+gprsize+16*3, 16 + + call .main_fast + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mov r3, r4 + jmp m(idct_8x32_internal_8bpc).end2 + +.end1: + LOAD_8ROWS rsp+gprsize+16*35, 16 + lea dstq, [dstq+strideq*2] + lea r3, [rsp+16*32+gprsize] + call .write + mov dstq, [rsp+gprsize*2+16*67] + mov r3d, [rsp+gprsize*3+16*67] + lea r4, [dstq+8] + mov [rsp+gprsize*2+16*67], r4 + lea r4, [o(.end1)] + + dec r3d + jg .pass2_loop + ret +.write: + mova [r3+16*0], m7 + mov r4, -16*32 + pxor m7, m7 + sub coeffq, r4 +.zero_loop: + mova [coeffq+r4+16*0], m7 + mova [coeffq+r4+16*1], m7 + add r4, 16*2 + jl .zero_loop + call .write_main2 + LOAD_8ROWS r3+16*11, 16 + call .write_main + LOAD_8ROWS r3+16*19, 16 + call .write_main + LOAD_8ROWS r3+16*27, 16 +.write_main: + mova [r3+16*0], m7 +.write_main2: + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [r3+16*0] + mova [r3+16*2], m5 + mova [r3+16*1], m6 + mova [r3+16*0], m7 + WRITE_8X4 0, 1, 2, 3, 5, 6, 7 + lea dstq, [dstq+strideq*2] + WRITE_8X4 4, [r3+16*2], [r3+16*1], [r3+16*0], 5, 6, 7 + lea dstq, [dstq+strideq*2] + ret + + +ALIGN function_align +cglobal_label .main_fast + mova m0, [rsp+gprsize*2+16*35] ;in1 + pmulhrsw m3, m0, [o(pw_4095x8)] ;t62,t63 + pmulhrsw m0, [o(pw_101x8)] ;t32,t33 + mova m7, [o(pd_2048)] + mova [rsp+gprsize*2+16*35], m0 ;t32 + mova [rsp+gprsize*2+16*66], m3 ;t63 + ITX_MULSUB_2W 3, 0, 1, 2, 7, 401, 4076 ;t33a, t62a + mova [rsp+gprsize*2+16*36], m3 ;t33a + mova [rsp+gprsize*2+16*65], m0 ;t62a + + mova m1, [rsp+gprsize*2+16*37] ;in15 + pmulhrsw m2, m1, [o(pw_3822x8)] ;t60,t61 + pmulhrsw m1, [o(pw_m1474x8)] ;t34,t35 + mova [rsp+gprsize*2+16*38], m1 ;t35 + mova [rsp+gprsize*2+16*63], m2 
;t60 + ITX_MULSUB_2W 2, 1, 0, 3, 7, m4076, 401 ;t34a, t61a + mova [rsp+gprsize*2+16*37], m2 ;t34a + mova [rsp+gprsize*2+16*64], m1 ;t61a + + mova m0, [rsp+gprsize*2+16*39] ;in9 + pmulhrsw m3, m0, [o(pw_3996x8)] ;t58,t59 + pmulhrsw m0, [o(pw_897x8)] ;t36,t37 + mova [rsp+gprsize*2+16*39], m0 ;t36 + mova [rsp+gprsize*2+16*62], m3 ;t59 + ITX_MULSUB_2W 3, 0, 1, 2, 7, 3166, 2598 ;t37a, t58a + mova [rsp+gprsize*2+16*40], m3 ;t37a + mova [rsp+gprsize*2+16*61], m0 ;t58a + + mova m1, [rsp+gprsize*2+16*41] ;in7 + pmulhrsw m2, m1, [o(pw_4036x8)] ;t56,t57 + pmulhrsw m1, [o(pw_m700x8)] ;t38,t39 + mova [rsp+gprsize*2+16*42], m1 ;t39 + mova [rsp+gprsize*2+16*59], m2 ;t56 + ITX_MULSUB_2W 2, 1, 0, 3, 7, m2598, 3166 ;t38a, t57a + mova [rsp+gprsize*2+16*41], m2 ;t38a + mova [rsp+gprsize*2+16*60], m1 ;t57a + + mova m0, [rsp+gprsize*2+16*43] ;in5 + pmulhrsw m3, m0, [o(pw_4065x8)] ;t54,t55 + pmulhrsw m0, [o(pw_501x8)] ;t40,t41 + mova [rsp+gprsize*2+16*43], m0 ;t40 + mova [rsp+gprsize*2+16*58], m3 ;t55 + ITX_MULSUB_2W 3, 0, 1, 2, 7, 1931, 3612 ;t41a, t54a + mova [rsp+gprsize*2+16*44], m3 ;t41a + mova [rsp+gprsize*2+16*57], m0 ;t54a + + mova m1, [rsp+gprsize*2+16*45] ;in11 + pmulhrsw m2, m1, [o(pw_3948x8)] ;t52,t53 + pmulhrsw m1, [o(pw_m1092x8)] ;t42,t43 + mova [rsp+gprsize*2+16*46], m1 ;t43 + mova [rsp+gprsize*2+16*55], m2 ;t52 + ITX_MULSUB_2W 2, 1, 0, 3, 7, m3612, 1931 ;t42a, t53a + mova [rsp+gprsize*2+16*45], m2 ;t42a + mova [rsp+gprsize*2+16*56], m1 ;t53a + + mova m0, [rsp+gprsize*2+16*47] ;in13 + pmulhrsw m3, m0, [o(pw_3889x8)] ;t50,t51 + pmulhrsw m0, [o(pw_1285x8)] ;t44,t45 + mova m6, m0 + mova [rsp+gprsize*2+16*54], m3 ;t51 + ITX_MULSUB_2W 3, 0, 1, 2, 7, 3920, 1189 ;t45a, t50a + mova [rsp+gprsize*2+16*48], m3 ;t45a + mova [rsp+gprsize*2+16*53], m0 ;t50a + + mova m0, [rsp+gprsize*2+16*49] ;in3 + pmulhrsw m3, m0, [o(pw_4085x8)] ;t48,t49 + pmulhrsw m0, [o(pw_m301x8)] ;t46,t47 + mova m4, m3 + mova m5, m0 + + jmp .main2 + +ALIGN function_align +cglobal_label .main + mova m0, [rsp+gprsize*2+16*35] ;in1 + mova m1, [rsp+gprsize*2+16*65] ;in31 + pmulhrsw m3, m0, [o(pw_4095x8)] ;t63a + pmulhrsw m0, [o(pw_101x8)] ;t32a + pmulhrsw m2, m1, [o(pw_2967x8)] ;t62a + pmulhrsw m1, [o(pw_m2824x8)] ;t33a + mova m7, [o(pd_2048)] + psubsw m4, m0, m1 ;t33 + paddsw m0, m1 ;t32 + psubsw m5, m3, m2 ;t62 + paddsw m3, m2 ;t63 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 401, 4076 ;t33a, t62a + mova [rsp+gprsize*2+16*35], m0 ;t32 + mova [rsp+gprsize*2+16*36], m5 ;t33a + mova [rsp+gprsize*2+16*65], m4 ;t62a + mova [rsp+gprsize*2+16*66], m3 ;t63 + + mova m0, [rsp+gprsize*2+16*63] ;in17 + mova m1, [rsp+gprsize*2+16*37] ;in15 + pmulhrsw m3, m0, [o(pw_3745x8)] ;t61a + pmulhrsw m0, [o(pw_1660x8)] ;t34a + pmulhrsw m2, m1, [o(pw_3822x8)] ;t60a + pmulhrsw m1, [o(pw_m1474x8)] ;t35a + psubsw m4, m1, m0 ;t34 + paddsw m0, m1 ;t35 + psubsw m5, m2, m3 ;t61 + paddsw m3, m2 ;t60 + ITX_MULSUB_2W 5, 4, 1, 2, 7, m4076, 401 ;t34a, t61a + mova [rsp+gprsize*2+16*37], m5 ;t34a + mova [rsp+gprsize*2+16*38], m0 ;t35 + mova [rsp+gprsize*2+16*63], m3 ;t60 + mova [rsp+gprsize*2+16*64], m4 ;t61a + + mova m0, [rsp+gprsize*2+16*39] ;in9 + mova m1, [rsp+gprsize*2+16*61] ;in23 + pmulhrsw m3, m0, [o(pw_3996x8)] ;t59a + pmulhrsw m0, [o(pw_897x8)] ;t36a + pmulhrsw m2, m1, [o(pw_3461x8)] ;t58a + pmulhrsw m1, [o(pw_m2191x8)] ;t37a + psubsw m4, m0, m1 ;t37 + paddsw m0, m1 ;t36 + psubsw m5, m3, m2 ;t58 + paddsw m3, m2 ;t59 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 3166, 2598 ;t37a, t58a + mova [rsp+gprsize*2+16*39], m0 ;t36 + mova [rsp+gprsize*2+16*40], m5 ;t37a + mova [rsp+gprsize*2+16*61], m4 ;t58a 
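+    ; In both the fast and full 64-point paths each odd input is first
+    ; pre-scaled with pmulhrsw against a pw_<c>x8 constant: pmulhrsw
+    ; computes (x*(8*c)*2 + 2^15) >> 16 == (x*c + 2048) >> 12, so
+    ; storing the cosine as c*8 folds the 12-bit descale and its
+    ; rounding into a single multiply (the pw_m* variants hold the
+    ; negated cosines for the mirrored halves of the butterflies).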
+ mova [rsp+gprsize*2+16*62], m3 ;t59 + + mova m0, [rsp+gprsize*2+16*59] ;in25 + mova m1, [rsp+gprsize*2+16*41] ;in7 + pmulhrsw m3, m0, [o(pw_3349x8)] ;t57a + pmulhrsw m0, [o(pw_2359x8)] ;t38a + pmulhrsw m2, m1, [o(pw_4036x8)] ;t56a + pmulhrsw m1, [o(pw_m700x8)] ;t39a + psubsw m4, m1, m0 ;t38 + paddsw m0, m1 ;t39 + psubsw m5, m2, m3 ;t57 + paddsw m3, m2 ;t56 + ITX_MULSUB_2W 5, 4, 1, 2, 7, m2598, 3166 ;t38a, t57a + mova [rsp+gprsize*2+16*41], m5 ;t38a + mova [rsp+gprsize*2+16*42], m0 ;t39 + mova [rsp+gprsize*2+16*59], m3 ;t56 + mova [rsp+gprsize*2+16*60], m4 ;t57a + + mova m0, [rsp+gprsize*2+16*43] ;in5 + mova m1, [rsp+gprsize*2+16*57] ;in27 + pmulhrsw m3, m0, [o(pw_4065x8)] ;t55a + pmulhrsw m0, [o(pw_501x8)] ;t40a + pmulhrsw m2, m1, [o(pw_3229x8)] ;t54a + pmulhrsw m1, [o(pw_m2520x8)] ;t41a + psubsw m4, m0, m1 ;t41 + paddsw m0, m1 ;t40 + psubsw m5, m3, m2 ;t54 + paddsw m3, m2 ;t55 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 1931, 3612 ;t41a, t54a + mova [rsp+gprsize*2+16*43], m0 ;t40 + mova [rsp+gprsize*2+16*44], m5 ;t41a + mova [rsp+gprsize*2+16*57], m4 ;t54a + mova [rsp+gprsize*2+16*58], m3 ;t55 + + mova m0, [rsp+gprsize*2+16*55] ;in21 + mova m1, [rsp+gprsize*2+16*45] ;in11 + pmulhrsw m3, m0, [o(pw_3564x8)] ;t53a + pmulhrsw m0, [o(pw_2019x8)] ;t42a + pmulhrsw m2, m1, [o(pw_3948x8)] ;t52a + pmulhrsw m1, [o(pw_m1092x8)] ;t43a + psubsw m4, m1, m0 ;t42 + paddsw m0, m1 ;t43 + psubsw m5, m2, m3 ;t53 + paddsw m3, m2 ;t52 + ITX_MULSUB_2W 5, 4, 1, 2, 7, m3612, 1931 ;t42a, t53a + mova [rsp+gprsize*2+16*45], m5 ;t42a + mova [rsp+gprsize*2+16*46], m0 ;t43 + mova [rsp+gprsize*2+16*55], m3 ;t52 + mova [rsp+gprsize*2+16*56], m4 ;t53a + + mova m0, [rsp+gprsize*2+16*47] ;in13 + mova m1, [rsp+gprsize*2+16*53] ;in19 + pmulhrsw m3, m0, [o(pw_3889x8)] ;t51a + pmulhrsw m0, [o(pw_1285x8)] ;t44a + pmulhrsw m2, m1, [o(pw_3659x8)] ;t50a + pmulhrsw m1, [o(pw_m1842x8)] ;t45a + psubsw m4, m0, m1 ;t45 + paddsw m0, m1 ;t44 + psubsw m5, m3, m2 ;t50 + paddsw m3, m2 ;t51 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 3920, 1189 ;t45a, t50a + mova m6, m0 + mova [rsp+gprsize*2+16*48], m5 ;t45a + mova [rsp+gprsize*2+16*53], m4 ;t50a + mova [rsp+gprsize*2+16*54], m3 ;t51 + + mova m0, [rsp+gprsize*2+16*51] ;in29 + mova m1, [rsp+gprsize*2+16*49] ;in3 + pmulhrsw m3, m0, [o(pw_3102x8)] ;t49a + pmulhrsw m0, [o(pw_2675x8)] ;t46a + pmulhrsw m2, m1, [o(pw_4085x8)] ;t48a + pmulhrsw m1, [o(pw_m301x8)] ;t47a + psubsw m5, m1, m0 ;t46 + paddsw m0, m1 ;t47 + psubsw m4, m2, m3 ;t49 + paddsw m3, m2 ;t48 + +ALIGN function_align +.main2: + ITX_MULSUB_2W 4, 5, 1, 2, 7, m1189, 3920 ;t46a, t49a + mova m1, [rsp+gprsize*2+16*54] ;t51 + psubsw m2, m0, m6 ;t44a + paddsw m0, m6 ;t47a + psubsw m6, m3, m1 ;t51a + paddsw m3, m1 ;t48a + mova [rsp+gprsize*2+16*50], m0 ;t47a + mova [rsp+gprsize*2+16*51], m3 ;t48a + ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406 ;t44, t51 + mova [rsp+gprsize*2+16*47], m6 ;t44 + mova [rsp+gprsize*2+16*54], m2 ;t51 + + mova m0, [rsp+gprsize*2+16*48] ;t45a + mova m3, [rsp+gprsize*2+16*53] ;t50a + psubsw m2, m4, m0 ;t45 + paddsw m4, m0 ;t46 + psubsw m6, m5, m3 ;t50 + paddsw m5, m3 ;t49 + ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406 ;t45a, t50a + mova [rsp+gprsize*2+16*48], m6 ;t45a + mova [rsp+gprsize*2+16*49], m4 ;t46 + mova [rsp+gprsize*2+16*52], m5 ;t49 + mova [rsp+gprsize*2+16*53], m2 ;t50a + + mova m0, [rsp+gprsize*2+16*43] ;t40 + mova m2, [rsp+gprsize*2+16*46] ;t43 + mova m3, [rsp+gprsize*2+16*55] ;t52 + mova m1, [rsp+gprsize*2+16*58] ;t55 + psubsw m4, m0, m2 ;t43a + paddsw m0, m2 ;t40a + psubsw m5, m1, m3 ;t52a + paddsw m1, m3 ;t55a + ITX_MULSUB_2W 5, 4, 2, 3, 
7, 3406, 2276 ;t43, t52 + mova [rsp+gprsize*2+16*43], m0 ;t40a + mova [rsp+gprsize*2+16*46], m5 ;t43 + mova [rsp+gprsize*2+16*55], m4 ;t52 + mova [rsp+gprsize*2+16*58], m1 ;t55a + + mova m0, [rsp+gprsize*2+16*44] ;t41a + mova m2, [rsp+gprsize*2+16*45] ;t42a + mova m3, [rsp+gprsize*2+16*56] ;t53a + mova m1, [rsp+gprsize*2+16*57] ;t54a + psubsw m4, m0, m2 ;t42 + paddsw m0, m2 ;t41 + psubsw m5, m1, m3 ;t53 + paddsw m1, m3 ;t54 + ITX_MULSUB_2W 5, 4, 2, 3, 7, 3406, 2276 ;t42a, t53a + mova [rsp+gprsize*2+16*44], m0 ;t41 + mova [rsp+gprsize*2+16*45], m5 ;t42a + mova [rsp+gprsize*2+16*56], m4 ;t53a + mova [rsp+gprsize*2+16*57], m1 ;t54 + + mova m0, [rsp+gprsize*2+16*41] ;t38a + mova m2, [rsp+gprsize*2+16*40] ;t37a + mova m3, [rsp+gprsize*2+16*61] ;t58a + mova m1, [rsp+gprsize*2+16*60] ;t57a + psubsw m4, m0, m2 ;t37 + paddsw m0, m2 ;t38 + psubsw m5, m1, m3 ;t58 + paddsw m1, m3 ;t57 + ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799 ;t37a, t58a + mova [rsp+gprsize*2+16*41], m0 ;t38 + mova [rsp+gprsize*2+16*40], m5 ;t37a + mova [rsp+gprsize*2+16*61], m4 ;t58a + mova [rsp+gprsize*2+16*60], m1 ;t57 + + mova m0, [rsp+gprsize*2+16*42] ;t39 + mova m2, [rsp+gprsize*2+16*39] ;t36 + mova m3, [rsp+gprsize*2+16*62] ;t59 + mova m1, [rsp+gprsize*2+16*59] ;t56 + psubsw m4, m0, m2 ;t36a + paddsw m0, m2 ;t39a + psubsw m5, m1, m3 ;t59a + paddsw m1, m3 ;t56a + ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799 ;t36, t59 + mova [rsp+gprsize*2+16*42], m0 ;t39a + mova [rsp+gprsize*2+16*39], m5 ;t36 + mova [rsp+gprsize*2+16*62], m4 ;t59 + mova [rsp+gprsize*2+16*59], m1 ;t56a + + mova m0, [rsp+gprsize*2+16*35] ;t32 + mova m2, [rsp+gprsize*2+16*38] ;t35 + mova m3, [rsp+gprsize*2+16*63] ;t60 + mova m1, [rsp+gprsize*2+16*66] ;t63 + psubsw m4, m0, m2 ;t35a + paddsw m0, m2 ;t32a + psubsw m5, m1, m3 ;t60a + paddsw m1, m3 ;t63a + ITX_MULSUB_2W 5, 4, 2, 3, 7, 799, 4017 ;t35, t60 + mova [rsp+gprsize*2+16*35], m0 ;t32a + mova [rsp+gprsize*2+16*38], m5 ;t35 + mova [rsp+gprsize*2+16*63], m4 ;t60 + mova [rsp+gprsize*2+16*66], m1 ;t63a + + mova m0, [rsp+gprsize*2+16*36] ;t33a + mova m2, [rsp+gprsize*2+16*37] ;t34a + mova m3, [rsp+gprsize*2+16*64] ;t61a + mova m1, [rsp+gprsize*2+16*65] ;t62a + psubsw m4, m0, m2 ;t34 + paddsw m0, m2 ;t33 + psubsw m5, m1, m3 ;t61 + paddsw m1, m3 ;t62 + ITX_MULSUB_2W 5, 4, 2, 3, 7, 799, 4017 ;t34a, t61a + + mova m2, [rsp+gprsize*2+16*41] ;t38 + mova m3, [rsp+gprsize*2+16*60] ;t57 + psubsw m6, m0, m2 ;t38a + paddsw m0, m2 ;t33a + psubsw m2, m1, m3 ;t57a + paddsw m1, m3 ;t62a + mova [rsp+gprsize*2+16*36], m0 ;t33a + mova [rsp+gprsize*2+16*65], m1 ;t62a + ITX_MULSUB_2W 2, 6, 0, 3, 7, 1567, 3784 ;t38, t57 + mova [rsp+gprsize*2+16*41], m2 ;t38 + mova [rsp+gprsize*2+16*60], m6 ;t57 + + mova m2, [rsp+gprsize*2+16*40] ;t37 + mova m3, [rsp+gprsize*2+16*61] ;t58 + psubsw m0, m5, m2 ;t37 + paddsw m5, m2 ;t34 + psubsw m1, m4, m3 ;t58 + paddsw m4, m3 ;t61 + ITX_MULSUB_2W 1, 0, 2, 3, 7, 1567, 3784 ;t37a, t58a + mova [rsp+gprsize*2+16*37], m5 ;t34 + mova [rsp+gprsize*2+16*64], m4 ;t61 + mova [rsp+gprsize*2+16*40], m1 ;t37a + mova [rsp+gprsize*2+16*61], m0 ;t58a + + mova m0, [rsp+gprsize*2+16*38] ;t35 + mova m2, [rsp+gprsize*2+16*39] ;t36 + mova m3, [rsp+gprsize*2+16*62] ;t59 + mova m1, [rsp+gprsize*2+16*63] ;t60 + psubsw m4, m0, m2 ;t36a + paddsw m0, m2 ;t35a + psubsw m5, m1, m3 ;t59a + paddsw m1, m3 ;t60a + ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784 ;t36, t59 + mova [rsp+gprsize*2+16*38], m0 ;t35a + mova [rsp+gprsize*2+16*39], m5 ;t36 + mova [rsp+gprsize*2+16*62], m4 ;t59 + mova [rsp+gprsize*2+16*63], m1 ;t60a + + mova m0, 
[rsp+gprsize*2+16*35] ;t32a + mova m2, [rsp+gprsize*2+16*42] ;t39a + mova m3, [rsp+gprsize*2+16*59] ;t56a + mova m1, [rsp+gprsize*2+16*66] ;t63a + psubsw m4, m0, m2 ;t39 + paddsw m0, m2 ;t32 + psubsw m5, m1, m3 ;t56 + paddsw m1, m3 ;t63 + ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784 ;t39a, t56a + mova [rsp+gprsize*2+16*35], m0 ;t32 + mova [rsp+gprsize*2+16*42], m5 ;t39a + mova [rsp+gprsize*2+16*59], m4 ;t56a + mova [rsp+gprsize*2+16*66], m1 ;t63 + + mova m0, [rsp+gprsize*2+16*50] ;t47a + mova m2, [rsp+gprsize*2+16*43] ;t40a + mova m3, [rsp+gprsize*2+16*58] ;t55a + mova m1, [rsp+gprsize*2+16*51] ;t48a + psubsw m4, m0, m2 ;t40 + paddsw m0, m2 ;t47 + psubsw m5, m1, m3 ;t55 + paddsw m1, m3 ;t48 + ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t40a, t55a + mova [rsp+gprsize*2+16*50], m0 ;t47 + mova [rsp+gprsize*2+16*43], m5 ;t40a + mova [rsp+gprsize*2+16*58], m4 ;t55a + mova [rsp+gprsize*2+16*51], m1 ;t48 + + mova m0, [rsp+gprsize*2+16*49] ;t46 + mova m2, [rsp+gprsize*2+16*44] ;t41 + mova m3, [rsp+gprsize*2+16*57] ;t54 + mova m1, [rsp+gprsize*2+16*52] ;t49 + psubsw m4, m0, m2 ;t41a + paddsw m0, m2 ;t46a + psubsw m5, m1, m3 ;t54a + paddsw m1, m3 ;t49a + ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t41, t54 + mova [rsp+gprsize*2+16*49], m0 ;t46a + mova [rsp+gprsize*2+16*44], m5 ;t41 + mova [rsp+gprsize*2+16*57], m4 ;t54 + mova [rsp+gprsize*2+16*52], m1 ;t49a + + mova m0, [rsp+gprsize*2+16*48] ;t45a + mova m2, [rsp+gprsize*2+16*45] ;t42a + mova m3, [rsp+gprsize*2+16*56] ;t53a + mova m1, [rsp+gprsize*2+16*53] ;t50a + psubsw m4, m0, m2 ;t42 + paddsw m0, m2 ;t45 + psubsw m5, m1, m3 ;t53 + paddsw m1, m3 ;t50 + ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t42a, t53a + mova [rsp+gprsize*2+16*48], m0 ;t45 + mova [rsp+gprsize*2+16*45], m5 ;t42a + mova [rsp+gprsize*2+16*56], m4 ;t53a + mova [rsp+gprsize*2+16*53], m1 ;t50 + + mova m0, [rsp+gprsize*2+16*47] ;t44 + mova m2, [rsp+gprsize*2+16*46] ;t43 + mova m3, [rsp+gprsize*2+16*55] ;t52 + mova m1, [rsp+gprsize*2+16*54] ;t51 + psubsw m4, m0, m2 ;t43a + paddsw m0, m2 ;t44a + psubsw m5, m1, m3 ;t52a + paddsw m1, m3 ;t51a + ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t43, t52 + + mova m2, [rsp+gprsize*2+16*38] ;t35a + mova m3, [rsp+gprsize*2+16*31] ;tmp[28] + psubsw m6, m2, m0 ;t44 + paddsw m2, m0 ;t35 + psubsw m0, m3, m2 ;out35 + paddsw m2, m3 ;out28 + mova m3, [rsp+gprsize*2+16*63] ;t60a + mova [rsp+gprsize*2+16*38], m0 ;out35 + mova [rsp+gprsize*2+16*31], m2 ;out28 + psubsw m0, m3, m1 ;t51 + paddsw m3, m1 ;t60 + ITX_MULSUB_2W 0, 6, 1, 2, 7, 2896, 2896 ;t44a, t51a + mova m2, [rsp+gprsize*2+16*6 ] ;tmp[3] + psubsw m1, m2, m3 ;out60 + paddsw m2, m3 ;out3 + mova m3, [rsp+gprsize*2+16*22] ;tmp[19] + mova [rsp+gprsize*2+16*63], m1 ;out60 + mova [rsp+gprsize*2+16*6 ], m2 ;out3 + psubsw m1, m3, m0 ;out44 + paddsw m3, m0 ;out19 + mova m2, [rsp+gprsize*2+16*15] ;tmp[12] + + mova m0, [rsp+gprsize*2+16*39] ;t36 + mova [rsp+gprsize*2+16*47], m1 ;out44 + mova [rsp+gprsize*2+16*22], m3 ;out19 + mova m1, [rsp+gprsize*2+16*62] ;t59 + psubsw m3, m2, m6 ;out51 + paddsw m2, m6 ;out12 + mova [rsp+gprsize*2+16*54], m3 ;out51 + mova [rsp+gprsize*2+16*15], m2 ;out12 + psubsw m2, m0, m5 ;t43a + paddsw m0, m5 ;t36a + mova m5, [rsp+gprsize*2+16*30] ;tmp[27] + psubsw m3, m1, m4 ;t52a + paddsw m1, m4 ;t59a + ITX_MULSUB_2W 3, 2, 4, 6, 7, 2896, 2896 ;t43, t52 + mova m4, [rsp+gprsize*2+16*7 ] ;tmp[4 ] + psubsw m6, m5, m0 ;out36 + paddsw m5, m0 ;out27 + psubsw m0, m4, m1 ;out59 + paddsw m4, m1 ;out4 + mova [rsp+gprsize*2+16*39], m6 ;out36 + mova [rsp+gprsize*2+16*30], m5 ;out27 + mova [rsp+gprsize*2+16*62], 
m0 ;out59 + mova [rsp+gprsize*2+16*7 ], m4 ;out4 + mova m0, [rsp+gprsize*2+16*23] ;tmp[20] + mova m5, [rsp+gprsize*2+16*14] ;tmp[11] + psubsw m4, m0, m3 ;out43 + paddsw m0, m3 ;out20 + psubsw m6, m5, m2 ;out52 + paddsw m5, m2 ;out11 + mova [rsp+gprsize*2+16*46], m4 ;out43 + mova [rsp+gprsize*2+16*23], m0 ;out20 + mova [rsp+gprsize*2+16*55], m6 ;out52 + mova [rsp+gprsize*2+16*14], m5 ;out11 + + mova m0, [rsp+gprsize*2+16*40] ;t37a + mova m5, [rsp+gprsize*2+16*45] ;t42a + mova m3, [rsp+gprsize*2+16*56] ;t53a + mova m1, [rsp+gprsize*2+16*61] ;t58a + mova m2, [rsp+gprsize*2+16*29] ;tmp[26] + psubsw m4, m0, m5 ;t42 + paddsw m0, m5 ;t37 + psubsw m5, m1, m3 ;t53 + paddsw m1, m3 ;t58 + ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t43, t52 + mova m3, [rsp+gprsize*2+16*8 ] ;tmp[5 ] + psubsw m6, m2, m0 ;out37 + paddsw m2, m0 ;out26 + psubsw m0, m3, m1 ;out58 + paddsw m3, m1 ;out5 + mova [rsp+gprsize*2+16*40], m6 ;out37 + mova [rsp+gprsize*2+16*29], m2 ;out26 + mova [rsp+gprsize*2+16*61], m0 ;out58 + mova [rsp+gprsize*2+16*8 ], m3 ;out5 + mova m0, [rsp+gprsize*2+16*24] ;tmp[21] + mova m1, [rsp+gprsize*2+16*13] ;tmp[10] + psubsw m2, m0, m5 ;out42 + paddsw m0, m5 ;out21 + psubsw m3, m1, m4 ;out53 + paddsw m1, m4 ;out10 + mova [rsp+gprsize*2+16*45], m2 ;out42 + mova [rsp+gprsize*2+16*24], m0 ;out21 + mova [rsp+gprsize*2+16*56], m3 ;out53 + mova [rsp+gprsize*2+16*13], m1 ;out10 + + mova m0, [rsp+gprsize*2+16*41] ;t38 + mova m5, [rsp+gprsize*2+16*44] ;t41 + mova m3, [rsp+gprsize*2+16*57] ;t54 + mova m1, [rsp+gprsize*2+16*60] ;t57 + mova m2, [rsp+gprsize*2+16*28] ;tmp[25] + psubsw m4, m0, m5 ;t41a + paddsw m0, m5 ;t38a + psubsw m5, m1, m3 ;t54a + paddsw m1, m3 ;t57a + ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t41a, t54a + mova m3, [rsp+gprsize*2+16*9 ] ;tmp[6 ] + psubsw m6, m2, m0 ;out38 + paddsw m2, m0 ;out25 + psubsw m0, m3, m1 ;out57 + paddsw m3, m1 ;out6 + mova [rsp+gprsize*2+16*41], m6 ;out38 + mova [rsp+gprsize*2+16*28], m2 ;out25 + mova [rsp+gprsize*2+16*60], m0 ;out57 + mova [rsp+gprsize*2+16*9 ], m3 ;out6 + mova m0, [rsp+gprsize*2+16*25] ;tmp[22] + mova m1, [rsp+gprsize*2+16*12] ;tmp[9 ] + psubsw m2, m0, m5 ;out41 + paddsw m0, m5 ;out22 + psubsw m3, m1, m4 ;out54 + paddsw m1, m4 ;out9 + mova [rsp+gprsize*2+16*44], m2 ;out41 + mova [rsp+gprsize*2+16*25], m0 ;out22 + mova [rsp+gprsize*2+16*57], m3 ;out54 + mova [rsp+gprsize*2+16*12], m1 ;out9 + + mova m0, [rsp+gprsize*2+16*42] ;t39a + mova m5, [rsp+gprsize*2+16*43] ;t40a + mova m3, [rsp+gprsize*2+16*58] ;t55a + mova m1, [rsp+gprsize*2+16*59] ;t56a + mova m2, [rsp+gprsize*2+16*27] ;tmp[24] + psubsw m4, m0, m5 ;t40 + paddsw m0, m5 ;t39 + psubsw m5, m1, m3 ;t55 + paddsw m1, m3 ;t56 + ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t40a, t55a + mova m3, [rsp+gprsize*2+16*10] ;tmp[7 ] + psubsw m6, m2, m0 ;out39 + paddsw m2, m0 ;out24 + psubsw m0, m3, m1 ;out56 + paddsw m3, m1 ;out7 + mova [rsp+gprsize*2+16*42], m6 ;out39 + mova [rsp+gprsize*2+16*27], m2 ;out24 + mova [rsp+gprsize*2+16*59], m0 ;out56 + mova [rsp+gprsize*2+16*10], m3 ;out7 + mova m0, [rsp+gprsize*2+16*26] ;tmp[23] + mova m1, [rsp+gprsize*2+16*11] ;tmp[8 ] + psubsw m2, m0, m5 ;out40 + paddsw m0, m5 ;out23 + psubsw m3, m1, m4 ;out55 + paddsw m1, m4 ;out8 + mova [rsp+gprsize*2+16*43], m2 ;out40 + mova [rsp+gprsize*2+16*26], m0 ;out23 + mova [rsp+gprsize*2+16*58], m3 ;out55 + mova [rsp+gprsize*2+16*11], m1 ;out8 + + mova m0, [rsp+gprsize*2+16*37] ;t34 + mova m5, [rsp+gprsize*2+16*48] ;t45 + mova m3, [rsp+gprsize*2+16*53] ;t50 + mova m1, [rsp+gprsize*2+16*64] ;t61 + mova m2, [rsp+gprsize*2+16*32] ;tmp[29] + 
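+    ; Final merge stage, repeated once per symmetric pair: the
+    ; 2896/2896 rotations presumably form (a -/+ b)*2896 >> 12, i.e.
+    ; scaling by cospi[32] ~= 4096/sqrt(2), after which each result is
+    ; subtracted from / added to the matching tmp[] value of the even
+    ; half so that out0-out63 are produced in place on the stack.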
psubsw m4, m0, m5 ;t45a + paddsw m0, m5 ;t34a + psubsw m5, m1, m3 ;t50a + paddsw m1, m3 ;t61a + ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t45, t50 + mova m3, [rsp+gprsize*2+16*5 ] ;tmp[2 ] + psubsw m6, m2, m0 ;out34 + paddsw m2, m0 ;out29 + psubsw m0, m3, m1 ;out61 + paddsw m3, m1 ;out2 + mova [rsp+gprsize*2+16*37], m6 ;out34 + mova [rsp+gprsize*2+16*32], m2 ;out29 + mova [rsp+gprsize*2+16*64], m0 ;out61 + mova [rsp+gprsize*2+16*5 ], m3 ;out2 + mova m0, [rsp+gprsize*2+16*21] ;tmp[18] + mova m1, [rsp+gprsize*2+16*16] ;tmp[13] + psubsw m2, m0, m5 ;out45 + paddsw m0, m5 ;out18 + psubsw m3, m1, m4 ;out50 + paddsw m1, m4 ;out13 + mova [rsp+gprsize*2+16*48], m2 ;out45 + mova [rsp+gprsize*2+16*21], m0 ;out18 + mova [rsp+gprsize*2+16*53], m3 ;out50 + mova [rsp+gprsize*2+16*16], m1 ;out13 + + mova m0, [rsp+gprsize*2+16*36] ;t33a + mova m5, [rsp+gprsize*2+16*49] ;t46a + mova m3, [rsp+gprsize*2+16*52] ;t49a + mova m1, [rsp+gprsize*2+16*65] ;t62a + mova m2, [rsp+gprsize*2+16*33] ;tmp[30] + psubsw m4, m0, m5 ;t46 + paddsw m0, m5 ;t33 + psubsw m5, m1, m3 ;t49 + paddsw m1, m3 ;t62 + ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t45, t50 + mova m3, [rsp+gprsize*2+16*4 ] ;tmp[1 ] + psubsw m6, m2, m0 ;out33 + paddsw m2, m0 ;out30 + psubsw m0, m3, m1 ;out62 + paddsw m3, m1 ;out1 + mova [rsp+gprsize*2+16*36], m6 ;out33 + mova [rsp+gprsize*2+16*33], m2 ;out30 + mova [rsp+gprsize*2+16*65], m0 ;out62 + mova [rsp+gprsize*2+16*4 ], m3 ;out1 + mova m0, [rsp+gprsize*2+16*20] ;tmp[17] + mova m1, [rsp+gprsize*2+16*17] ;tmp[14] + psubsw m2, m0, m5 ;out46 + paddsw m0, m5 ;out17 + psubsw m3, m1, m4 ;out49 + paddsw m1, m4 ;out14 + mova [rsp+gprsize*2+16*49], m2 ;out46 + mova [rsp+gprsize*2+16*20], m0 ;out17 + mova [rsp+gprsize*2+16*52], m3 ;out49 + mova [rsp+gprsize*2+16*17], m1 ;out14 + + mova m0, [rsp+gprsize*2+16*35] ;t32 + mova m5, [rsp+gprsize*2+16*50] ;t47 + mova m3, [rsp+gprsize*2+16*51] ;t48 + mova m1, [rsp+gprsize*2+16*66] ;t63 + mova m2, [rsp+gprsize*2+16*34] ;tmp[31] + psubsw m4, m0, m5 ;t47a + paddsw m0, m5 ;t32a + psubsw m5, m1, m3 ;t48a + paddsw m1, m3 ;t63a + ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t47, t48 + mova m3, [rsp+gprsize*2+16*3 ] ;tmp[0 ] + psubsw m6, m2, m0 ;out32 + paddsw m2, m0 ;out31 + psubsw m0, m3, m1 ;out63 + paddsw m3, m1 ;out0 + mova [rsp+gprsize*2+16*35], m6 ;out32 + mova [rsp+gprsize*2+16*34], m2 ;out31 + mova [rsp+gprsize*2+16*66], m0 ;out63 + mova [rsp+gprsize*2+16*3 ], m3 ;out0 + mova m0, [rsp+gprsize*2+16*19] ;tmp[16] + mova m1, [rsp+gprsize*2+16*18] ;tmp[15] + psubsw m2, m0, m5 ;out47 + paddsw m0, m5 ;out16 + psubsw m3, m1, m4 ;out48 + paddsw m1, m4 ;out15 + mova [rsp+gprsize*2+16*50], m2 ;out47 + mova [rsp+gprsize*2+16*19], m0 ;out16 + mova [rsp+gprsize*2+16*51], m3 ;out48 + mova [rsp+gprsize*2+16*18], m1 ;out15 + ret + + +cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + + call m(idct_64x16_internal_8bpc) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] + mov [coeffq], eobd + mov r3d, 16 + lea tx2q, [o(.end)] + +.body: + pmulhrsw m0, m2 + movd m2, [o(pw_2048)] ;intentionally rip-relative + pmulhrsw m0, m1 + pmulhrsw m0, m2 + pshuflw m0, m0, q0000 + punpcklwd m0, m0 + pxor m7, m7 + +.loop: + mova m1, [dstq+16*0] + mova m3, [dstq+16*1] + mova m5, [dstq+16*2] + mova m6, [dstq+16*3] + punpckhbw m2, m1, m7 + punpcklbw m1, m7 + punpckhbw m4, m3, m7 + punpcklbw m3, m7 + paddw m2, m0 + paddw m1, m0 + paddw m4, m0 + paddw m3, m0 + packuswb 
m1, m2 + packuswb m3, m4 + punpckhbw m2, m5, m7 + punpcklbw m5, m7 + punpckhbw m4, m6, m7 + punpcklbw m6, m7 + paddw m2, m0 + paddw m5, m0 + paddw m4, m0 + paddw m6, m0 + packuswb m5, m2 + packuswb m6, m4 + mova [dstq+16*0], m1 + mova [dstq+16*1], m3 + mova [dstq+16*2], m5 + mova [dstq+16*3], m6 + add dstq, strideq + dec r3d + jg .loop + jmp tx2q + +.end: + RET + + +%macro LOAD_4ROWS 2-3 0 ;src, stride, is_rect2 + +%if %3 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [%1+%2*0] + pmulhrsw m1, m3, [%1+%2*1] + pmulhrsw m2, m3, [%1+%2*2] + pmulhrsw m3, [%1+%2*3] +%else + mova m0, [%1+%2*0] + mova m1, [%1+%2*1] + mova m2, [%1+%2*2] + mova m3, [%1+%2*3] +%endif +%endmacro + +%macro LOAD_4ROWS_H 2 ;src, stride + mova m4, [%1+%2*0] + mova m5, [%1+%2*1] + mova m6, [%1+%2*2] + mova m7, [%1+%2*3] +%endmacro + +cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mov r3d, 2 + mov [rsp+gprsize*2+16*67], dstq + lea dstq, [rsp+gprsize+16*68] + +.pass1_loop: + LOAD_4ROWS coeffq+32*0, 32*8 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + pxor m4, m4 + LOAD_4ROWS coeffq+32*4, 32*8 + + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS coeffq+32*2, 32*4 + mova [rsp+gprsize+16*19], m0 + mova [rsp+gprsize+16*26], m1 + mova [rsp+gprsize+16*23], m2 + mova [rsp+gprsize+16*22], m3 + mova [rsp+gprsize+16*21], m4 + mova [rsp+gprsize+16*24], m5 + mova [rsp+gprsize+16*25], m6 + mova [rsp+gprsize+16*20], m7 + + call m(idct_8x32_internal_8bpc).main_fast + SAVE_8ROWS rsp+gprsize+16*3, 16 + + LOAD_8ROWS coeffq+32*1, 32*2 + mova [rsp+gprsize+16*35], m0 ;in1 + mova [rsp+gprsize+16*49], m1 ;in3 + mova [rsp+gprsize+16*43], m2 ;in5 + mova [rsp+gprsize+16*41], m3 ;in7 + mova [rsp+gprsize+16*39], m4 ;in9 + mova [rsp+gprsize+16*45], m5 ;in11 + mova [rsp+gprsize+16*47], m6 ;in13 + mova [rsp+gprsize+16*37], m7 ;in15 + + LOAD_8ROWS coeffq+32*17, 32*2 + mova [rsp+gprsize+16*63], m0 ;in17 + mova [rsp+gprsize+16*53], m1 ;in19 + mova [rsp+gprsize+16*55], m2 ;in21 + mova [rsp+gprsize+16*61], m3 ;in23 + mova [rsp+gprsize+16*59], m4 ;in25 + mova [rsp+gprsize+16*57], m5 ;in27 + mova [rsp+gprsize+16*51], m6 ;in29 + mova [rsp+gprsize+16*65], m7 ;in31 + + call m(idct_16x64_internal_8bpc).main + + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end: + SAVE_8ROWS coeffq+32*0, 32 + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+32*8, 32 + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end2: + SAVE_8ROWS coeffq+32*16, 32 + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end3: + SAVE_8ROWS coeffq+32*24, 32 + LOAD_8ROWS rsp+gprsize+16*35, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end4: + SAVE_8ROWS dstq+32*0, 32 + LOAD_8ROWS rsp+gprsize+16*43, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end5)] + jmp 
m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end5: + SAVE_8ROWS dstq+32*8, 32 + LOAD_8ROWS rsp+gprsize+16*51, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end6)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end6: + SAVE_8ROWS dstq+32*16, 32 + LOAD_8ROWS rsp+gprsize+16*59, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end7)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end7: + SAVE_8ROWS dstq+32*24, 32 + + add coeffq, 16 + add dstq, 16 + dec r3d + jg .pass1_loop + +.pass2: + mov dstq, [rsp+gprsize*2+16*67] + sub coeffq, 32 + mov r3d, 4 + +.pass2_loop: + mov [rsp+gprsize*1+16*67], r3d + + LOAD_4ROWS coeffq+16*0, 32*2 + LOAD_4ROWS_H coeffq+16*1, 32*2 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_4ROWS coeffq+16*2, 32*2 + LOAD_4ROWS_H coeffq+16*3, 32*2 + call m(idct_16x8_internal_8bpc).main + + mov r3, dstq + lea tx2q, [o(.end)] + lea dstq, [dstq+strideq*8] + jmp m(idct_8x8_internal_8bpc).end + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.end1)] + mov dstq, r3 + jmp m(idct_8x8_internal_8bpc).end + +.end1: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + + add coeffq, 16*16 + mov r3d, [rsp+gprsize*1+16*67] + mov dstq, [rsp+gprsize*2+16*67] + add dstq, 8 + mov [rsp+gprsize*2+16*67], dstq + dec r3d + jg .pass2_loop + + mov r3d, 4 + lea coeffq, [rsp+gprsize+16*68] +.pass2_loop2: + mov [rsp+gprsize*1+16*67], r3d + + LOAD_4ROWS coeffq+16*0, 32*2 + LOAD_4ROWS_H coeffq+16*1, 32*2 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_4ROWS coeffq+16*2, 32*2 + LOAD_4ROWS_H coeffq+16*3, 32*2 + call m(idct_16x8_internal_8bpc).main + + mov r3, dstq + lea tx2q, [o(.end2)] + lea dstq, [dstq+strideq*8] + jmp m(idct_8x8_internal_8bpc).end + +.end2: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.end3)] + mov dstq, r3 + jmp m(idct_8x8_internal_8bpc).end + +.end3: + + add coeffq, 16*16 + mov r3d, [rsp+gprsize*1+16*67] + mov dstq, [rsp+gprsize*2+16*67] + add dstq, 8 + mov [rsp+gprsize*2+16*67], dstq + dec r3d + jg .pass2_loop2 + ret + + +cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + call m(idct_32x64_internal_8bpc) +.end: + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_16384)] + mov [coeffq], eobd + pmulhrsw m0, m1 + mov r3d, 64 + lea tx2q, [o(.end)] + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body + + +cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mov r4d, 2 + sub eobd, 136 + mov [rsp+gprsize*1+16*67], eobd + mov r3d, 4 + cmovs r3d, r4d + +%if ARCH_X86_32 + LEA r5, $$ +%endif + + mov [rsp+gprsize*2+16*67], coeffq + +.pass1_loop: + LOAD_8ROWS coeffq+64*1, 64*2, 1 + mova [rsp+gprsize+16*19], m0 ;in1 + mova [rsp+gprsize+16*26], m1 ;in3 + mova [rsp+gprsize+16*23], m2 ;in5 + mova [rsp+gprsize+16*22], m3 ;in7 + mova [rsp+gprsize+16*21], m4 ;in9 + mova [rsp+gprsize+16*24], m5 ;in11 + mova [rsp+gprsize+16*25], m6 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + + mov tx2d, [rsp+gprsize*1+16*67] + test tx2d, tx2d + jl .fast + +.full: + LOAD_8ROWS coeffq+64*0, 64*4, 1 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+64*2, 64*4, 1 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS 
coeffq+64*17, 64*2, 1 + mova [rsp+gprsize+16*33], m0 ;in17 + mova [rsp+gprsize+16*28], m1 ;in19 + mova [rsp+gprsize+16*29], m2 ;in21 + mova [rsp+gprsize+16*32], m3 ;in23 + mova [rsp+gprsize+16*31], m4 ;in25 + mova [rsp+gprsize+16*30], m5 ;in27 + mova [rsp+gprsize+16*27], m6 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + + call m(idct_8x32_internal_8bpc).main + jmp .pass1_end + +.fast: + LOAD_4ROWS coeffq, 256, 1 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_4ROWS coeffq+128*1, 256, 1 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + call m(idct_8x32_internal_8bpc).main_fast + +.pass1_end: + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end1: + SAVE_8ROWS coeffq+64*0, 64 + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end2: + SAVE_8ROWS coeffq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end3: + SAVE_8ROWS coeffq+64*16, 64 + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end4: + SAVE_8ROWS coeffq+64*24, 64 + + add coeffq, 16 + dec r3d + jg .pass1_loop + +.pass2: + mov coeffq, [rsp+gprsize*2+16*67] + mov r3d, 4 + lea r4, [dstq+8] + mov [rsp+gprsize*2+16*67], r4 + lea r4, [o(m(idct_16x64_internal_8bpc).end1)] + jmp m(idct_16x64_internal_8bpc).pass2_loop + + +cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + call m(idct_64x32_internal_8bpc) +.end: + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_16384)] + pmulhrsw m0, m1 + mov [coeffq], eobd + mov r3d, 32 + lea tx2q, [o(.end)] + jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body + + +cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mov r4d, 2 + sub eobd, 136 + mov [rsp+gprsize*1+16*67], eobd + mov r3d, 4 + cmovs r3d, r4d + +%if ARCH_X86_32 + LEA r5, $$ +%endif + + mov [rsp+gprsize*2+16*67], coeffq + mov [rsp+gprsize*3+16*67], dstq + lea dstq, [rsp+gprsize+16*69] + mov [rsp+gprsize*4+16*67], dstq + +.pass1_loop: + LOAD_4ROWS coeffq+64*0, 64*8, 1 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + pxor m4, m4 + LOAD_4ROWS coeffq+64*4, 64*8, 1 + + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS coeffq+64*2, 64*4, 1 + mova [rsp+gprsize+16*19], m0 + mova [rsp+gprsize+16*26], m1 + mova [rsp+gprsize+16*23], m2 + mova [rsp+gprsize+16*22], m3 + mova [rsp+gprsize+16*21], m4 + mova [rsp+gprsize+16*24], m5 + mova [rsp+gprsize+16*25], m6 + mova [rsp+gprsize+16*20], m7 + + call m(idct_8x32_internal_8bpc).main_fast + SAVE_8ROWS rsp+gprsize+16*3, 16 + + LOAD_8ROWS coeffq+64*1, 64*2, 1 + mova [rsp+gprsize+16*35], m0 ;in1 + mova [rsp+gprsize+16*49], m1 ;in3 + mova [rsp+gprsize+16*43], m2 ;in5 + mova [rsp+gprsize+16*41], m3 ;in7 + mova [rsp+gprsize+16*39], m4 ;in9 + mova [rsp+gprsize+16*45], m5 ;in11 + mova [rsp+gprsize+16*47], m6 ;in13 + mova [rsp+gprsize+16*37], m7 ;in15 + + LOAD_8ROWS 
coeffq+64*17, 64*2, 1 + mova [rsp+gprsize+16*63], m0 ;in17 + mova [rsp+gprsize+16*53], m1 ;in19 + mova [rsp+gprsize+16*55], m2 ;in21 + mova [rsp+gprsize+16*61], m3 ;in23 + mova [rsp+gprsize+16*59], m4 ;in25 + mova [rsp+gprsize+16*57], m5 ;in27 + mova [rsp+gprsize+16*51], m6 ;in29 + mova [rsp+gprsize+16*65], m7 ;in31 + + call m(idct_16x64_internal_8bpc).main + + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end: + SAVE_8ROWS coeffq+64*0, 64 + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end1: + SAVE_8ROWS coeffq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end2: + SAVE_8ROWS coeffq+64*16, 64 + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end3: + SAVE_8ROWS coeffq+64*24, 64 + LOAD_8ROWS rsp+gprsize+16*35, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end4: + SAVE_8ROWS dstq+64*0, 64 + LOAD_8ROWS rsp+gprsize+16*43, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end5)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end5: + SAVE_8ROWS dstq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*51, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end6)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end6: + SAVE_8ROWS dstq+64*16, 64 + LOAD_8ROWS rsp+gprsize+16*59, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end7)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end7: + SAVE_8ROWS dstq+64*24, 64 + + add coeffq, 16 + add dstq, 16 + dec r3d + jg .pass1_loop + +.pass2: + mov coeffq, [rsp+gprsize*4+16*67] + mov dstq, [rsp+gprsize*3+16*67] + mov eobd, [rsp+gprsize*1+16*67] + lea dstq, [dstq+32] + mov [rsp+gprsize*1+16*35], eobd + lea tx2q, [o(.pass2_end)] + mov r3d, 4 + jmp m(idct_32x32_internal_8bpc).pass2_loop + +.pass2_end: + mova [rsp+gprsize+16*0], m7 + lea r3, [o(.pass2_end1)] + jmp m(idct_8x32_internal_8bpc).end2 + +.pass2_end1: + lea tx2q, [o(.pass2_end)] + add coeffq, 16*32 + mov dstq, [rsp+gprsize*2+16*35] + mov r3d, [rsp+gprsize*3+16*35] + dec r3d + jg m(idct_32x32_internal_8bpc).pass2_loop + +.pass2_end2: + mov dstq, [rsp+gprsize*3+16*67] + mov coeffq, [rsp+gprsize*2+16*67] + lea tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)] + mov r3d, 4 + jmp m(idct_32x32_internal_8bpc).pass2_loop + + +cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + + call m(idct_64x64_internal_8bpc) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] + mov [coeffq], eobd + mov r3d, 64 + lea tx2q, [o(m(inv_txfm_add_dct_dct_64x32_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body + +cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mov r5d, 4 + mov r4d, 2 + sub eobd, 136 + cmovns r4d, r5d + +%if ARCH_X86_32 + LEA r5, $$ +%endif + + mov [rsp+gprsize*1+16*67], eobd + mov r3d, r4d + mov [rsp+gprsize*4+16*67], coeffq + mov [rsp+gprsize*3+16*67], dstq + lea dstq, [rsp+gprsize+16*69] + mov [rsp+gprsize*2+16*67], dstq + +.pass1_loop: + LOAD_4ROWS coeffq+64*0, 64*8 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS 
rsp+gprsize+16*3, 16 + + pxor m4, m4 + LOAD_4ROWS coeffq+64*4, 64*8 + + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS coeffq+64*2, 64*4 + mova [rsp+gprsize+16*19], m0 + mova [rsp+gprsize+16*26], m1 + mova [rsp+gprsize+16*23], m2 + mova [rsp+gprsize+16*22], m3 + mova [rsp+gprsize+16*21], m4 + mova [rsp+gprsize+16*24], m5 + mova [rsp+gprsize+16*25], m6 + mova [rsp+gprsize+16*20], m7 + + call m(idct_8x32_internal_8bpc).main_fast + SAVE_8ROWS rsp+gprsize+16*3, 16 + + LOAD_8ROWS coeffq+64*1, 64*2 + mova [rsp+gprsize+16*35], m0 ;in1 + mova [rsp+gprsize+16*49], m1 ;in3 + mova [rsp+gprsize+16*43], m2 ;in5 + mova [rsp+gprsize+16*41], m3 ;in7 + mova [rsp+gprsize+16*39], m4 ;in9 + mova [rsp+gprsize+16*45], m5 ;in11 + mova [rsp+gprsize+16*47], m6 ;in13 + mova [rsp+gprsize+16*37], m7 ;in15 + + LOAD_8ROWS coeffq+64*17, 64*2 + mova [rsp+gprsize+16*63], m0 ;in17 + mova [rsp+gprsize+16*53], m1 ;in19 + mova [rsp+gprsize+16*55], m2 ;in21 + mova [rsp+gprsize+16*61], m3 ;in23 + mova [rsp+gprsize+16*59], m4 ;in25 + mova [rsp+gprsize+16*57], m5 ;in27 + mova [rsp+gprsize+16*51], m6 ;in29 + mova [rsp+gprsize+16*65], m7 ;in31 + + call m(idct_16x64_internal_8bpc).main + + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end: + SAVE_8ROWS coeffq+64*0, 64 + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end2: + SAVE_8ROWS coeffq+64*16, 64 + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end3: + SAVE_8ROWS coeffq+64*24, 64 + LOAD_8ROWS rsp+gprsize+16*35, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end4: + SAVE_8ROWS dstq+64*0, 64 + LOAD_8ROWS rsp+gprsize+16*43, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end5)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end5: + SAVE_8ROWS dstq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*51, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end6)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end6: + SAVE_8ROWS dstq+64*16, 64 + LOAD_8ROWS rsp+gprsize+16*59, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end7)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end7: + SAVE_8ROWS dstq+64*24, 64 + + add coeffq, 16 + add dstq, 16 + dec r3d + jg .pass1_loop + +.pass2: + mov dstq, [rsp+gprsize*3+16*67] + mov coeffq, [rsp+gprsize*2+16*67] + lea dstq, [dstq+32] + mov r3d, 4 + lea r4, [dstq+8] + mov [rsp+gprsize*2+16*67], r4 + lea r4, [o(.pass2_end)] + jmp m(idct_16x64_internal_8bpc).pass2_loop + +.pass2_end: + LOAD_8ROWS rsp+gprsize+16*35, 16 + lea dstq, [dstq+strideq*2] + lea r3, [rsp+16*32+gprsize] + mova [rsp+gprsize+16*0], m7 + call m(idct_16x64_internal_8bpc).write + mov dstq, [rsp+gprsize*2+16*67] + mov r3d, [rsp+gprsize*3+16*67] + lea r4, [dstq+8] + mov [rsp+gprsize*2+16*67], r4 + lea r4, [o(.pass2_end)] + + dec r3d + jg 
m(idct_16x64_internal_8bpc).pass2_loop + +.pass2_end2: + mov coeffq, [rsp+gprsize*4+16*67] + mov dstq, [rsp+gprsize*2+16*67] + mov r3d, 4 + sub dstq, 72 + lea r4, [dstq+8] + mov [rsp+gprsize*2+16*67], r4 + lea r4, [o(m(idct_16x64_internal_8bpc).end1)] + jmp m(idct_16x64_internal_8bpc).pass2_loop diff --git a/third_party/dav1d/src/x86/loopfilter.h b/third_party/dav1d/src/x86/loopfilter.h new file mode 100644 index 0000000000..33c842a9ce --- /dev/null +++ b/third_party/dav1d/src/x86/loopfilter.h @@ -0,0 +1,66 @@ +/* + * Copyright © 2018-2021, VideoLAN and dav1d authors + * Copyright © 2018, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/cpu.h" +#include "src/loopfilter.h" + +#define decl_loopfilter_sb_fns(ext) \ +decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, ext)); \ +decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, ext)); \ +decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, ext)); \ +decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, ext)) + +decl_loopfilter_sb_fns(ssse3); +decl_loopfilter_sb_fns(avx2); +decl_loopfilter_sb_fns(avx512icl); + +static ALWAYS_INLINE void loop_filter_dsp_init_x86(Dav1dLoopFilterDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + + c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, ssse3); + c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, ssse3); + c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, ssse3); + c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, ssse3); + +#if ARCH_X86_64 + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; + + c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx2); + c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx2); + c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx2); + c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx2); + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + + c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, avx512icl); + c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, avx512icl); + c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, avx512icl); + c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, avx512icl); +#endif +} diff --git a/third_party/dav1d/src/x86/loopfilter16_avx2.asm b/third_party/dav1d/src/x86/loopfilter16_avx2.asm new file mode 100644 index 0000000000..ed83000ac2 --- /dev/null +++ b/third_party/dav1d/src/x86/loopfilter16_avx2.asm @@ -0,0 +1,1161 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
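+
+; High-bitdepth (16bpc container, 10- and 12-bit) loop filter, AVX2. The
+; FILTER macro below is instantiated per filter width (4, 6, 8 or 16) and
+; edge direction: the v variants load rows above/below dstq (horizontal
+; edges), while the h variants load columns and transpose them (vertical
+; edges). vmask[0..2] in maskq selects the widest applicable filter for
+; each 4-pixel unit, and the is_12bpc bit derived from bitdepth_max (r7m)
+; selects the 10- vs 12-bit clip and rounding constants below.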
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 + +pb_mask: dd 1, 1, 2, 2, 4, 4, 8, 8 +pb_4x1_4x5_4x9_4x13: times 4 db 0, 1 + times 4 db 8, 9 + times 4 db 0, 1 + times 4 db 8, 9 + +pw_1: times 16 dw 1 +pw_2: times 16 dw 2 +pw_3: times 16 dw 3 +pw_4096: times 2 dw 4096 + +; 10bpc/12bpc: +pw_4: times 2 dw 4 + times 2 dw 16 +clip_max: times 2 dw 511 + times 2 dw 2047 +clip_min: times 2 dw -512 + times 2 dw -2048 + +SECTION .text + +; in: out: +; mm%1 a b c d a e i m +; mm%2 e f g h b f j n +; mm%3 i j k l -> c g k o +; mm%4 m n o p d h l p +%macro TRANSPOSE4X4W 5 + punpcklwd m%5, m%1, m%2 + punpckhwd m%1, m%2 + punpcklwd m%2, m%3, m%4 + punpckhwd m%3, m%4 + punpckldq m%4, m%5, m%2 + punpckhdq m%5, m%2 + punpckldq m%2, m%1, m%3 + punpckhdq m%1, m%3 + + SWAP %1, %4 + SWAP %2, %5, %3 +%endmacro + +; in: out: +; xmm%1 a b c d e f g h a i q y 6 E M U +; xmm%2 i j k l m n o p b j r z 7 F N V +; xmm%3 q r s t u v w x c k s 0 8 G O W +; xmm%4 y z 0 1 2 3 4 5 d l t 1 9 H P X +; xmm%5 6 7 8 9 A B C D -> e m u 2 A I Q Y +; xmm%6 E F G H I J K L f n v 3 B J R Z +; xmm%7 M N O P Q R S T g o w 4 C K S + +; xmm%8 U V W X Y Z + = h p x 5 D L T = +%macro TRANSPOSE8X8W 9 + ; xmm%1 a b c d e f g h a i q y b j r z + ; xmm%2 i j k l m n o p c k s 0 d l t 1 + ; xmm%3 q r s t u v w x -> e m u 2 f n v 3 + ; xmm%4 y z 0 1 2 3 4 5 g o w 4 h p x 5 + TRANSPOSE4X4W %1, %2, %3, %4, %9 + + ; xmm%5 6 7 8 9 A B C D 6 E M U 7 F N V + ; xmm%6 E F G H I J K L 8 G O W 9 H P X + ; xmm%7 M N O P Q R S T -> A I Q Y B J R Z + ; xmm%8 U V W X Y Z + = C K S + D L T = + TRANSPOSE4X4W %5, %6, %7, %8, %9 + + ; xmm%1 a i q y b j r z a i q y 6 E M U + ; xmm%2 c k s 0 d l t 1 b j r z 7 F N V + ; xmm%3 e m u 2 f n v 3 c k s 0 8 G O W + ; xmm%4 g o w 4 h p x 5 d l t 1 9 H P X + ; xmm%5 6 E M U 7 F N V -> e m u 2 A I Q Y + ; xmm%6 8 G O W 9 H P X f n v 3 B J R Z + ; xmm%7 A I Q Y B J R Z g o w 4 C K S + + ; xmm%8 C K S + D L T = h p x 5 D L T = + punpckhqdq m%9, m%1, m%5 + punpcklqdq m%1, m%5 + punpckhqdq m%5, m%2, m%6 + punpcklqdq m%2, m%6 + punpckhqdq m%6, m%3, m%7 + punpcklqdq m%3, m%7 + punpckhqdq m%7, m%4, m%8 + punpcklqdq m%4, m%8 + + SWAP %8, %7, %4, %5, %3, %2, %9 +%endmacro + +; transpose and write m3-6, everything else is scratch +%macro TRANSPOSE_8x4_AND_WRITE_4x16 0 + ; transpose 8x4 + punpcklwd m0, m3, m4 + punpckhwd m3, m4 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpckldq m6, m0, m4 + punpckhdq m0, m4 + punpckldq m4, m3, m5 + punpckhdq m3, m5 + + ; write out + movq [dstq+strideq*0-4], xm6 + movhps [dstq+strideq*1-4], xm6 + movq [dstq+strideq*2-4], xm0 + movhps [dstq+stride3q -4], xm0 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm4 + movhps [dstq+strideq*1-4], xm4 + movq [dstq+strideq*2-4], xm3 + movhps [dstq+stride3q -4], xm3 + lea dstq, [dstq+strideq*4] + + vextracti128 xm6, m6, 1 + vextracti128 xm0, m0, 1 + vextracti128 xm4, m4, 1 + vextracti128 xm3, m3, 1 + + movq [dstq+strideq*0-4], xm6 + movhps [dstq+strideq*1-4], xm6 + movq [dstq+strideq*2-4], xm0 + movhps [dstq+stride3q -4], xm0 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm4 + movhps [dstq+strideq*1-4], xm4 + movq [dstq+strideq*2-4], xm3 + movhps [dstq+stride3q -4], xm3 + lea dstq, [dstq+strideq*4] +%endmacro + +%macro FILTER 2 ; width [4/6/8/16], dir [h/v] + ; load data +%ifidn %2, v +%if %1 == 4 + lea tmpq, [dstq+mstrideq*2] + mova m3, [tmpq+strideq*0] ; p1 + mova m4, [tmpq+strideq*1] ; p0 + mova m5, [tmpq+strideq*2] ; q0 + mova m6, [tmpq+stride3q] ; q1 +%else + ; load 6-8 pixels, remainder (for 
wd=16) will be read inline + lea tmpq, [dstq+mstrideq*4] + ; we load p3 later + mova m13, [tmpq+strideq*1] + mova m3, [tmpq+strideq*2] + mova m4, [tmpq+stride3q] + mova m5, [dstq+strideq*0] + mova m6, [dstq+strideq*1] + mova m14, [dstq+strideq*2] +%if %1 != 6 + mova m15, [dstq+stride3q] +%endif +%endif +%else + ; load lines +%if %1 == 4 + movq xm3, [dstq+strideq*0-4] + movq xm4, [dstq+strideq*1-4] + movq xm5, [dstq+strideq*2-4] + movq xm6, [dstq+stride3q -4] + lea tmpq, [dstq+strideq*4] + movq xm11, [tmpq+strideq*0-4] + movq xm13, [tmpq+strideq*1-4] + movq xm14, [tmpq+strideq*2-4] + movq xm15, [tmpq+stride3q -4] + lea tmpq, [tmpq+strideq*4] + ; this overreads by 8 bytes but the buffers are padded + ; so that should be ok + vinserti128 m3, [tmpq+strideq*0-4], 1 + vinserti128 m4, [tmpq+strideq*1-4], 1 + vinserti128 m5, [tmpq+strideq*2-4], 1 + vinserti128 m6, [tmpq+stride3q -4], 1 + lea tmpq, [tmpq+strideq*4] + vinserti128 m11, [tmpq+strideq*0-4], 1 + vinserti128 m13, [tmpq+strideq*1-4], 1 + vinserti128 m14, [tmpq+strideq*2-4], 1 + vinserti128 m15, [tmpq+stride3q -4], 1 + + ; transpose 4x8 + ; xm3: A-D0,A-D4 + ; xm4: A-D1,A-D5 + ; xm5: A-D2,A-D6 + ; xm6: A-D3,A-D7 + punpcklwd m7, m3, m4 + punpcklwd m3, m11, m13 + punpcklwd m4, m5, m6 + punpcklwd m5, m14, m15 + ; xm7: A0-1,B0-1,C0-1,D0-1 + ; xm3: A4-5,B4-5,C4-5,D4-5 + ; xm4: A2-3,B2-3,C2-3,D2-3 + ; xm5: A6-7,B6-7,C6-7,D6-7 + punpckldq m6, m7, m4 + punpckhdq m7, m4 + punpckldq m8, m3, m5 + punpckhdq m5, m3, m5 + ; xm6: A0-3,B0-3 + ; xm7: C0-3,D0-3 + ; xm8: A4-7,B4-7 + ; xm5: C4-7,D4-7 + punpcklqdq m3, m6, m8 + punpckhqdq m4, m6, m8 + punpckhqdq m6, m7, m5 + punpcklqdq m5, m7, m5 + ; xm3: A0-7 + ; xm4: B0-7 + ; xm5: C0-7 + ; xm6: D0-7 +%elif %1 == 6 || %1 == 8 + movu xm3, [dstq+strideq*0-8] + movu xm4, [dstq+strideq*1-8] + movu xm5, [dstq+strideq*2-8] + movu xm6, [dstq+stride3q -8] + lea tmpq, [dstq+strideq*4] + movu xm11, [tmpq+strideq*0-8] + movu xm13, [tmpq+strideq*1-8] + movu xm14, [tmpq+strideq*2-8] + movu xm15, [tmpq+stride3q -8] + lea tmpq, [tmpq+strideq*4] + vinserti128 m3, [tmpq+strideq*0-8], 1 + vinserti128 m4, [tmpq+strideq*1-8], 1 + vinserti128 m5, [tmpq+strideq*2-8], 1 + vinserti128 m6, [tmpq+stride3q -8], 1 + lea tmpq, [tmpq+strideq*4] + vinserti128 m11, [tmpq+strideq*0-8], 1 + vinserti128 m13, [tmpq+strideq*1-8], 1 + vinserti128 m14, [tmpq+strideq*2-8], 1 + vinserti128 m15, [tmpq+stride3q -8], 1 + + ; transpose 8x16 + ; xm3: A-H0,A-H8 + ; xm4: A-H1,A-H9 + ; xm5: A-H2,A-H10 + ; xm6: A-H3,A-H11 + ; xm11: A-H4,A-H12 + ; xm13: A-H5,A-H13 + ; xm14: A-H6,A-H14 + ; xm15: A-H7,A-H15 + punpcklwd m7, m3, m4 + punpckhwd m3, m4 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpcklwd m6, m11, m13 + punpckhwd m11, m13 + punpcklwd m13, m14, m15 + punpckhwd m14, m15 + ; xm7: A0-1,B0-1,C0-1,D0-1 + ; xm3: E0-1,F0-1,G0-1,H0-1 + ; xm4: A2-3,B2-3,C2-3,D2-3 + ; xm5: E2-3,F2-3,G2-3,H2-3 + ; xm6: A4-5,B4-5,C4-5,D4-5 + ; xm11: E4-5,F4-5,G4-5,H4-5 + ; xm13: A6-7,B6-7,C6-7,D6-7 + ; xm14: E6-7,F6-7,G6-7,H6-7 + punpckldq m15, m7, m4 + punpckhdq m7, m4 + punpckldq m9, m3, m5 + punpckhdq m8, m3, m5 + punpckldq m3, m6, m13 + punpckhdq m6, m13 + punpckldq m10, m11, m14 + punpckhdq m11, m14 + ; xm15: A0-3,B0-3 + ; xm7: C0-3,D0-3 + ; xm9: E0-3,F0-3 + ; xm8: G0-3,H0-3 + ; xm3: A4-7,B4-7 + ; xm6: C4-7,D4-7 + ; xm10: E4-7,F4-7 + ; xm11: G4-7,H4-7 +%if %1 != 6 + punpcklqdq m0, m15, m3 +%endif + punpckhqdq m13, m15, m3 + punpcklqdq m3, m7, m6 + punpckhqdq m4, m7, m6 + punpcklqdq m5, m9, m10 + punpckhqdq m6, m9, m10 + punpcklqdq m14, m8, m11 +%if %1 != 6 + punpckhqdq m15, 
m8, m11 + mova [rsp+5*32], m0 +%endif +%else + ; We only use 14 pixels but we'll need the remainder at the end for + ; the second transpose + mova xm0, [dstq+strideq*0-16] + mova xm1, [dstq+strideq*1-16] + mova xm2, [dstq+strideq*2-16] + mova xm3, [dstq+stride3q -16] + lea tmpq, [dstq+strideq*4] + mova xm4, [tmpq+strideq*0-16] + mova xm5, [tmpq+strideq*1-16] + mova xm6, [tmpq+strideq*2-16] + mova xm7, [tmpq+stride3q -16] + lea tmpq, [tmpq+strideq*4] + vinserti128 m0, m0, [tmpq+strideq*0-16], 1 + vinserti128 m1, m1, [tmpq+strideq*1-16], 1 + vinserti128 m2, m2, [tmpq+strideq*2-16], 1 + vinserti128 m3, m3, [tmpq+stride3q -16], 1 + lea tmpq, [tmpq+strideq*4] + vinserti128 m4, m4, [tmpq+strideq*0-16], 1 + vinserti128 m5, m5, [tmpq+strideq*1-16], 1 + vinserti128 m6, m6, [tmpq+strideq*2-16], 1 + vinserti128 m7, m7, [tmpq+stride3q -16], 1 + + TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 8 + + mova [rsp+6*32], m0 + mova [rsp+7*32], m1 + mova [rsp+8*32], m2 + mova [rsp+9*32], m3 + mova [rsp+5*32], m4 + + mova xm0, [dstq+strideq*0] + mova xm1, [dstq+strideq*1] + mova xm2, [dstq+strideq*2] + mova xm3, [dstq+stride3q ] + lea tmpq, [dstq+strideq*4] + mova xm8, [tmpq+strideq*0] + mova xm9, [tmpq+strideq*1] + mova xm10, [tmpq+strideq*2] + mova xm11, [tmpq+stride3q ] + lea tmpq, [tmpq+strideq*4] + vinserti128 m0, m0, [tmpq+strideq*0], 1 + vinserti128 m1, m1, [tmpq+strideq*1], 1 + vinserti128 m2, m2, [tmpq+strideq*2], 1 + vinserti128 m3, m3, [tmpq+stride3q ], 1 + lea tmpq, [tmpq+strideq*4] + vinserti128 m8, m8, [tmpq+strideq*0], 1 + vinserti128 m9, m9, [tmpq+strideq*1], 1 + vinserti128 m10, m10, [tmpq+strideq*2], 1 + vinserti128 m11, m11, [tmpq+stride3q ], 1 + + TRANSPOSE8X8W 0, 1, 2, 3, 8, 9, 10, 11, 4 + + mova [rsp+10*32], m8 + mova [rsp+11*32], m9 + mova [rsp+12*32], m10 + mova [rsp+13*32], m11 + + ; 5,6,7,0,1,2,3 -> 13,3,4,5,6,14,15 + SWAP 13, 5, 0 + SWAP 3, 6, 1, 15 + SWAP 4, 7 + SWAP 2, 14 +%endif +%endif + + ; load L/E/I/H +%ifidn %2, v + pmovzxbw m1, [lq] + pmovzxbw m0, [lq+l_strideq] + pxor m2, m2 +%else + vpbroadcastq m0, [lq] ; l0, l1 + vpbroadcastq m1, [lq+l_strideq] ; l2, l3 + vpbroadcastq m2, [lq+l_strideq*2] ; l4, l5 + vpbroadcastq m10, [lq+l_stride3q] ; l6, l7 + punpckldq m0, m1 ; l0, l2, l1, l3 [2x] + punpckldq m2, m10 ; l4, l6, l5, l7 [2x] + vpblendd m0, m0, m2, 11110000b ; l0, l2, l1, l3, l4, l6, l5, l7 + pxor m2, m2 + punpcklbw m1, m0, m2 ; l0, l2, l4, l6 + punpckhbw m0, m2 ; l1, l3, l5, l7 +%endif + pcmpeqw m10, m2, m0 + pand m1, m10 + por m0, m1 ; l[x][] ? 
l[x][] : l[x-stride][] + pshufb m0, [pb_4x1_4x5_4x9_4x13] ; l[x][1] + pcmpeqw m10, m2, m0 ; !L + psrlw m10, 1 + psrlw m2, m0, [lutq+128] + vpbroadcastw m1, [lutq+136] + pminuw m2, m1 + pmaxuw m2, [pw_1] ; I + psrlw m1, m0, 4 ; H + paddw m0, [pw_2] + vpbroadcastd m8, [r11] + paddw m0, m0 + paddw m0, m2 ; E + REPX {pmullw x, m8}, m0, m1, m2 + + psubw m8, m3, m4 ; p1-p0 + psubw m9, m5, m6 ; q1-q0 + REPX {pabsw x, x}, m8, m9 + pmaxuw m8, m10 + pmaxuw m8, m9 + pcmpgtw m7, m8, m1 ; hev +%if %1 != 4 + psubw m9, m13, m4 ; p2-p0 + pabsw m9, m9 + pmaxuw m9, m8 +%if %1 != 6 +%ifidn %2, v + mova m11, [tmpq+strideq*0] ; p3 +%else + mova m11, [rsp+5*32] ; p3 +%endif + psubw m10, m11, m4 ; p3-p0 + pabsw m10, m10 + pmaxuw m9, m10 +%endif + psubw m10, m5, m14 ; q2-q0 + pabsw m10, m10 + pmaxuw m9, m10 +%if %1 != 6 + psubw m10, m5, m15 ; q3-q0 + pabsw m10, m10 + pmaxuw m9, m10 +%endif + vpbroadcastd m10, [r11] + pcmpgtw m9, m10 ; !flat8in + + psubw m10, m13, m3 ; p2-p1 + pabsw m10, m10 +%if %1 != 6 + psubw m11, m13 ; p3-p2 + pabsw m11, m11 + pmaxuw m10, m11 + psubw m11, m14, m15 ; q3-q2 + pabsw m11, m11 + pmaxuw m10, m11 +%endif + psubw m11, m14, m6 ; q2-q1 + pabsw m11, m11 + pmaxuw m10, m11 + +%if %1 == 16 + vpbroadcastd m11, [maskq+8] + vpbroadcastd m1, [maskq+4] + por m11, m1 + pand m11, m12 + pcmpeqd m11, m12 + pand m10, m11 +%else + vpbroadcastd m11, [maskq+4] + pand m11, m12 + pcmpeqd m11, m12 + pand m10, m11 ; only apply fm-wide to wd>4 blocks +%endif + pmaxuw m8, m10 +%endif + pcmpgtw m8, m2 + + psubw m10, m3, m6 ; p1-q1 + psubw m11, m4, m5 ; p0-q0 + REPX {pabsw x, x}, m10, m11 + paddw m11, m11 + psrlw m10, 1 + paddw m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1) + pcmpgtw m10, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E + por m8, m10 + +%if %1 == 16 + +%ifidn %2, v + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1] + mova m1, [tmpq+strideq*2] + mova m2, [tmpq+stride3q] +%else + mova m0, [rsp+7*32] + mova m1, [rsp+8*32] + mova m2, [rsp+9*32] +%endif + REPX {psubw x, m4}, m0, m1, m2 + REPX {pabsw x, x}, m0, m1, m2 + pmaxuw m1, m0 + pmaxuw m1, m2 +%ifidn %2, v + lea tmpq, [dstq+strideq*4] + mova m0, [tmpq+strideq*0] + mova m2, [tmpq+strideq*1] + mova m10, [tmpq+strideq*2] +%else + mova m0, [rsp+10*32] + mova m2, [rsp+11*32] + mova m10, [rsp+12*32] +%endif + REPX {psubw x, m5}, m0, m2, m10 + REPX {pabsw x, x}, m0, m2, m10 + pmaxuw m0, m2 + pmaxuw m1, m10 + pmaxuw m1, m0 + vpbroadcastd m0, [r11] + pcmpgtw m1, m0 ; !flat8out + por m1, m9 ; !flat8in | !flat8out + vpbroadcastd m2, [maskq+8] + pand m10, m2, m12 + pcmpeqd m10, m12 + pandn m1, m10 ; flat16 + pandn m1, m8, m1 ; flat16 & fm + + vpbroadcastd m10, [maskq+4] + por m10, m2 + pand m2, m10, m12 + pcmpeqd m2, m12 + pandn m9, m2 ; flat8in + pandn m9, m8, m9 + vpbroadcastd m2, [maskq+0] + por m2, m10 + pand m2, m12 + pcmpeqd m2, m12 + pandn m8, m2 + pandn m8, m9, m8 ; fm & !flat8 & !flat16 + pandn m9, m1, m9 ; flat8 & !flat16 +%elif %1 != 4 + vpbroadcastd m0, [maskq+4] + pand m2, m0, m12 + pcmpeqd m2, m12 + pandn m9, m2 + pandn m9, m8, m9 ; flat8 & fm + vpbroadcastd m2, [maskq+0] + por m0, m2 + pand m0, m12 + pcmpeqd m0, m12 + pandn m8, m0 + pandn m8, m9, m8 ; fm & !flat8 +%else + vpbroadcastd m0, [maskq+0] + pand m0, m12 + pcmpeqd m0, m12 + pandn m8, m0 ; fm +%endif + + ; short filter + vpbroadcastd m0, [r11+8*1] ; 511 or 2047 + vpbroadcastd m2, [r11+8*2] ; -512 or -2048 + psubw m10, m5, m4 + paddw m11, m10, m10 + paddw m11, m10 + psubw m10, m3, m6 ; iclip_diff(p1-q1) + pminsw m10, m0 + pmaxsw m10, m2 + pand m10, m7 ; f=iclip_diff(p1-q1)&hev + paddw m10, m11 
; f=iclip_diff(3*(q0-p0)+f) + pminsw m10, m0 + pmaxsw m10, m2 + pand m8, m10 ; f&=fm + vpbroadcastd m10, [pw_4] + paddw m10, m8 + paddw m8, [pw_3] + REPX {pminsw x, m0}, m10, m8 + psraw m10, 3 ; f2 + psraw m8, 3 ; f1 + psubw m5, m10 + paddw m4, m8 + + paddw m10, [pw_1] + psraw m10, 1 ; f=(f1+1)>>1 + pandn m8, m7, m10 ; f&=!hev + paddw m3, m8 + psubw m6, m8 + pxor m8, m8 + psubw m0, m2 ; 1023 or 4095 + REPX {pminsw x, m0}, m3, m4, m5, m6 + REPX {pmaxsw x, m8}, m3, m4, m5, m6 + +%if %1 == 16 + +; m3-6 = p1/p0/q0/q1, m9=flat8, m1=flat16 +; m12=filter bits mask +; m13-15=p2/q2/q3 +; m0,2,7-8,10-11 = free + + ; flat16 filter +%ifidn %2, v + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1] ; p6 + mova m2, [tmpq+strideq*2] ; p5 + mova m7, [tmpq+stride3q] ; p4 + mova m11, [tmpq+strideq*4] ; p3 +%else + mova m0, [rsp+7*32] + mova m2, [rsp+8*32] + mova m7, [rsp+9*32] + mova m11, [rsp+5*32] +%endif + + mova [rsp+ 0*32], m9 + + ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 + paddw m8, m0, [pw_1] + psllw m8, 3 ; p6*8+8 + paddw m10, m2, m7 ; p5+p4 + psubw m8, m0 + paddw m10, m10 ; (p5+p4)*2 + paddw m8, m11 ; p6*7+p3 + paddw m10, m13 ; (p5+p4)*2+p2 + paddw m8, m3 ; p6*7+p3+p1 + paddw m10, m4 ; (p5+p4)*2+p2+p0 + paddw m8, m5 ; p6*7+p3+p1+q0 + paddw m8, m10 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 + psrlw m10, m8, 4 + vpblendvb m10, m2, m10, m1 +%ifidn %2, v + mova [tmpq+strideq*2], m10 ; p5 +%else + mova [rsp+8*32], m10 +%endif + + ; sub p6*2, add p3/q1 + paddw m8, m11 + paddw m10, m0, m0 + paddw m8, m6 + psubw m8, m10 + psrlw m10, m8, 4 + vpblendvb m10, m7, m10, m1 +%ifidn %2, v + mova [tmpq+stride3q], m10 ; p4 +%else + mova [rsp+9*32], m10 +%endif + + ; sub p6/p5, add p2/q2 + psubw m8, m0 + paddw m10, m13, m14 + psubw m8, m2 + paddw m8, m10 + psrlw m10, m8, 4 + vpblendvb m10, m11, m10, m1 +%ifidn %2, v + mova [tmpq+strideq*4], m10 ; p3 + lea tmpq, [dstq+strideq*4] +%else + mova [rsp+5*32], m10 +%endif + + ; sub p6/p4, add p1/q3 + paddw m8, m3 + paddw m10, m0, m7 + paddw m8, m15 + psubw m8, m10 + psrlw m10, m8, 4 + vpblendvb m10, m13, m10, m1 + mova [rsp+1*32], m10 ; don't clobber p2/m13 + + ; sub p6/p3, add p0/q4 + paddw m8, m4 + paddw m10, m0, m11 +%ifidn %2, v + paddw m8, [tmpq+strideq*0] +%else + paddw m8, [rsp+10*32] +%endif + psubw m8, m10 + psrlw m10, m8, 4 + vpblendvb m10, m3, m10, m1 + mova [rsp+2*32], m10 ; don't clobber p1/m3 + + ; sub p6/p2, add q0/q5 + paddw m8, m5 + paddw m10, m0, m13 +%ifidn %2, v + paddw m8, [tmpq+strideq*1] +%else + paddw m8, [rsp+11*32] +%endif + psubw m8, m10 + psrlw m10, m8, 4 + vpblendvb m10, m4, m10, m1 + mova [rsp+3*32], m10 ; don't clobber p0/m4 + + ; sub p6/p1, add q1/q6 + paddw m8, m6 + paddw m10, m0, m3 +%ifidn %2, v + mova m0, [tmpq+strideq*2] ; q6 +%else + mova m0, [rsp+12*32] ; q6 +%endif + paddw m8, m0 + psubw m8, m10 + psrlw m10, m8, 4 + vpblendvb m10, m5, m10, m1 + mova [rsp+4*32], m10 ; don't clobber q0/m5 + + ; sub p5/p0, add q2/q6 + paddw m8, m14 + paddw m10, m2, m4 + paddw m8, m0 + psubw m8, m10 + psrlw m10, m8, 4 + vpblendvb m2, m6, m10, m1 ; don't clobber q1/m6 + + ; sub p4/q0, add q3/q6 + paddw m8, m15 + paddw m10, m7, m5 + paddw m8, m0 + psubw m8, m10 + psrlw m10, m8, 4 + vpblendvb m7, m14, m10, m1 ; don't clobber q2/m14 + + ; sub p3/q1, add q4/q6 +%ifidn %2, v + paddw m8, [tmpq+strideq*0] +%else + paddw m8, [rsp+10*32] +%endif + paddw m10, m11, m6 + paddw m8, m0 + psubw m8, m10 + psrlw m10, m8, 4 + vpblendvb m10, m15, m10, m1 +%ifidn %2, v + mova [tmpq+mstrideq], m10 ; q3 +%else + mova [rsp+14*32], m10 +%endif + + ; sub p2/q2, add q5/q6 +%ifidn %2, v + paddw m8, 
[tmpq+strideq*1] +%else + paddw m8, [rsp+11*32] +%endif + paddw m10, m13, m14 + paddw m8, m0 + psubw m8, m10 + psrlw m10, m8, 4 +%ifidn %2, v + mova m9, [tmpq+strideq*0] +%else + mova m9, [rsp+10*32] +%endif + vpblendvb m10, m9, m10, m1 +%ifidn %2, v + mova [tmpq+strideq*0], m10 ; q4 +%else + mova [rsp+10*32], m10 +%endif + + ; sub p1/q3, add q6*2 + psubw m8, m3 + paddw m0, m0 + psubw m8, m15 + paddw m8, m0 + psrlw m10, m8, 4 +%ifidn %2, v + mova m9, [tmpq+strideq*1] +%else + mova m9, [rsp+11*32] +%endif + vpblendvb m10, m9, m10, m1 +%ifidn %2, v + mova [tmpq+strideq*1], m10 ; q5 +%else + mova [rsp+11*32], m10 +%endif + + mova m9, [rsp+0*32] + mova m13, [rsp+1*32] + mova m3, [rsp+2*32] + mova m4, [rsp+3*32] + mova m5, [rsp+4*32] + SWAP 2, 6 + SWAP 7, 14 +%ifidn %2, v + lea tmpq, [dstq+mstrideq*4] +%else + mova m15, [rsp+14*32] +%endif +%endif + +%if %1 >= 8 + ; flat8 filter + vpbroadcastd m7, [pw_4096] +%ifidn %2, v + mova m0, [tmpq+strideq*0] ; p3 +%else + mova m0, [rsp+5*32] ; p3 +%endif + paddw m1, m0, m13 ; p3+p2 + paddw m2, m3, m4 ; p1+p0 + paddw m8, m1, m1 ; 2*(p3+p2) + paddw m2, m0 ; p1+p0+p3 + paddw m8, m5 ; 2*(p3+p2)+q0 + paddw m2, m8 ; 3*p3+2*p2+p1+p0+q0 + pmulhrsw m10, m2, m7 + + paddw m8, m3, m6 + psubw m2, m1 + paddw m2, m8 + pmulhrsw m8, m2, m7 + + paddw m11, m0, m3 + paddw m1, m4, m14 + psubw m2, m11 + paddw m2, m1 + pmulhrsw m1, m2, m7 + + paddw m11, m0, m4 + pblendvb m4, m1, m9 + paddw m1, m5, m15 + psubw m2, m11 + paddw m2, m1 + pmulhrsw m11, m2, m7 + + paddw m2, m6 + paddw m2, m15 + paddw m1, m13, m5 + pblendvb m5, m11, m9 + pblendvb m13, m10, m9 + psubw m2, m1 + pmulhrsw m1, m2, m7 + + psubw m2, m3 + pblendvb m3, m8, m9 + psubw m2, m6 + pblendvb m6, m1, m9 + paddw m1, m15, m14 + paddw m2, m1 + pmulhrsw m2, m7 + + pblendvb m14, m2, m9 + +%ifidn %2, v + mova [tmpq+strideq*1], m13 ; p2 + mova [tmpq+strideq*2], m3 ; p1 + mova [tmpq+stride3q ], m4 ; p0 + mova [dstq+strideq*0], m5 ; q0 + mova [dstq+strideq*1], m6 ; q1 + mova [dstq+strideq*2], m14 ; q2 +%elif %1 == 8 + TRANSPOSE8X8W 0, 13, 3, 4, 5, 6, 14, 15, 1 + + ; write 8x16 + movu [dstq+strideq*0-8], xm0 + movu [dstq+strideq*1-8], xm13 + movu [dstq+strideq*2-8], xm3 + movu [dstq+stride3q -8], xm4 + lea dstq, [dstq+strideq*4] + movu [dstq+strideq*0-8], xm5 + movu [dstq+strideq*1-8], xm6 + movu [dstq+strideq*2-8], xm14 + movu [dstq+stride3q -8], xm15 + lea dstq, [dstq+strideq*4] + vextracti128 [dstq+strideq*0-8], m0, 1 + vextracti128 [dstq+strideq*1-8], m13, 1 + vextracti128 [dstq+strideq*2-8], m3, 1 + vextracti128 [dstq+stride3q -8], m4, 1 + lea dstq, [dstq+strideq*4] + vextracti128 [dstq+strideq*0-8], m5, 1 + vextracti128 [dstq+strideq*1-8], m6, 1 + vextracti128 [dstq+strideq*2-8], m14, 1 + vextracti128 [dstq+stride3q -8], m15, 1 + lea dstq, [dstq+strideq*4] +%else + mova m8, [rsp+6*32] + mova m1, [rsp+7*32] + mova m2, [rsp+8*32] + mova m7, [rsp+9*32] + TRANSPOSE8X8W 8, 1, 2, 7, 0, 13, 3, 4, 9 + + mova [dstq+strideq*0-16], xm8 + mova [dstq+strideq*1-16], xm1 + mova [dstq+strideq*2-16], xm2 + mova [dstq+stride3q -16], xm7 + lea tmpq, [dstq+strideq*4] + mova [tmpq+strideq*0-16], xm0 + mova [tmpq+strideq*1-16], xm13 + mova [tmpq+strideq*2-16], xm3 + mova [tmpq+stride3q -16], xm4 + lea tmpq, [tmpq+strideq*4] + vextracti128 [tmpq+strideq*0-16], m8, 1 + vextracti128 [tmpq+strideq*1-16], m1, 1 + vextracti128 [tmpq+strideq*2-16], m2, 1 + vextracti128 [tmpq+stride3q -16], m7, 1 + lea tmpq, [tmpq+strideq*4] + vextracti128 [tmpq+strideq*0-16], m0, 1 + vextracti128 [tmpq+strideq*1-16], m13, 1 + vextracti128 [tmpq+strideq*2-16], m3, 1 
+ vextracti128 [tmpq+stride3q -16], m4, 1 + + mova m0, [rsp+10*32] + mova m1, [rsp+11*32] + mova m2, [rsp+12*32] + mova m3, [rsp+13*32] + TRANSPOSE8X8W 5, 6, 14, 15, 0, 1, 2, 3, 4 + mova [dstq+strideq*0], xm5 + mova [dstq+strideq*1], xm6 + mova [dstq+strideq*2], xm14 + mova [dstq+stride3q ], xm15 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm1 + mova [dstq+strideq*2], xm2 + mova [dstq+stride3q ], xm3 + lea dstq, [dstq+strideq*4] + vextracti128 [dstq+strideq*0], m5, 1 + vextracti128 [dstq+strideq*1], m6, 1 + vextracti128 [dstq+strideq*2], m14, 1 + vextracti128 [dstq+stride3q ], m15, 1 + lea dstq, [dstq+strideq*4] + vextracti128 [dstq+strideq*0], m0, 1 + vextracti128 [dstq+strideq*1], m1, 1 + vextracti128 [dstq+strideq*2], m2, 1 + vextracti128 [dstq+stride3q ], m3, 1 + lea dstq, [dstq+strideq*4] +%endif +%elif %1 == 6 + ; flat6 filter + vpbroadcastd m7, [pw_4096] + paddw m8, m3, m4 + paddw m8, m13 ; p2+p1+p0 + paddw m11, m13, m5 + paddw m8, m8 + paddw m8, m11 ; p2+2*(p2+p1+p0)+q0 + pmulhrsw m2, m8, m7 + + paddw m8, m5 + paddw m11, m13, m13 + paddw m8, m6 + psubw m8, m11 + pmulhrsw m10, m8, m7 + + paddw m8, m6 + paddw m11, m13, m3 + paddw m8, m14 + psubw m8, m11 + pmulhrsw m11, m8, m7 + + psubw m8, m3 + paddw m14, m14 + psubw m8, m4 + paddw m8, m14 + pmulhrsw m8, m7 + + pblendvb m3, m2, m9 + pblendvb m4, m10, m9 + pblendvb m5, m11, m9 + pblendvb m6, m8, m9 + +%ifidn %2, v + mova [tmpq+strideq*2], m3 ; p1 + mova [tmpq+stride3q ], m4 ; p0 + mova [dstq+strideq*0], m5 ; q0 + mova [dstq+strideq*1], m6 ; q1 +%else + TRANSPOSE_8x4_AND_WRITE_4x16 +%endif +%else +%ifidn %2, v + mova [tmpq+strideq*0], m3 ; p1 + mova [tmpq+strideq*1], m4 ; p0 + mova [tmpq+strideq*2], m5 ; q0 + mova [tmpq+stride3q ], m6 ; q1 +%else + TRANSPOSE_8x4_AND_WRITE_4x16 +%endif +%endif +%endmacro + +INIT_YMM avx2 +cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \ + dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp, mask_bits + mov r6d, r7m + lea r11, [pw_4] + shr r6d, 11 ; is_12bpc + lea r11, [r11+r6*4] + mov wd, wm + shl l_strideq, 2 + sub lq, l_strideq + mov mstrideq, strideq + neg mstrideq + lea stride3q, [strideq*3] + mov mask_bitsd, 0xf + mova m12, [pb_mask] + +.loop: + test [maskq+8], mask_bitsd ; vmask[2] + jz .no_flat16 + + FILTER 16, v + jmp .end + +.no_flat16: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + + FILTER 8, v + jmp .end + +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .end + + call .v4 + +.end: + pslld m12, 4 + add lq, 16 + add dstq, 32 + shl mask_bitsd, 4 + sub wd, 4 + jg .loop + RET +ALIGN function_align +.v4: + FILTER 4, v + ret + +INIT_YMM avx2 +cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \ + dst, stride, mask, l, l_stride, lut, \ + h, stride3, l_stride3, tmp, mask_bits + mov r6d, r7m + lea r11, [pw_4] + shr r6d, 11 ; is_12bpc + lea r11, [r11+r6*4] + mov hd, hm + shl l_strideq, 2 + sub lq, 4 + lea stride3q, [strideq*3] + lea l_stride3q, [l_strideq*3] + mov mask_bitsd, 0xf + mova m12, [pb_mask] + +.loop: + test [maskq+8], mask_bitsd ; vmask[2] + jz .no_flat16 + + FILTER 16, h + jmp .end + +.no_flat16: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + + FILTER 8, h + jmp .end + +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .no_filter + + call .h4 + jmp .end + +.no_filter: + lea dstq, [dstq+strideq*8] + lea dstq, [dstq+strideq*8] +.end: + pslld m12, 4 + lea lq, [lq+l_strideq*4] + shl mask_bitsd, 4 + sub hd, 4 + jg .loop + RET +ALIGN function_align +.h4: + FILTER 4, h + ret + +INIT_YMM avx2 +cglobal lpf_v_sb_uv_16bpc, 6, 
12, 16, \ + dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp, mask_bits + mov r6d, r7m + lea r11, [pw_4] + shr r6d, 11 ; is_12bpc + lea r11, [r11+r6*4] + mov wd, wm + shl l_strideq, 2 + sub lq, l_strideq + mov mstrideq, strideq + neg mstrideq + lea stride3q, [strideq*3] + mov mask_bitsd, 0xf + mova m12, [pb_mask] + +.loop: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + + FILTER 6, v + jmp .end + +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .end + + call mangle(private_prefix %+ _lpf_v_sb_y_16bpc_avx2).v4 + +.end: + pslld m12, 4 + add lq, 16 + add dstq, 32 + shl mask_bitsd, 4 + sub wd, 4 + jg .loop + RET + +INIT_YMM avx2 +cglobal lpf_h_sb_uv_16bpc, 6, 12, 16, \ + dst, stride, mask, l, l_stride, lut, \ + h, stride3, l_stride3, tmp, mask_bits + mov r6d, r7m + lea r11, [pw_4] + shr r6d, 11 ; is_12bpc + lea r11, [r11+r6*4] + mov hd, hm + shl l_strideq, 2 + sub lq, 4 + lea stride3q, [strideq*3] + lea l_stride3q, [l_strideq*3] + mov mask_bitsd, 0xf + mova m12, [pb_mask] + +.loop: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + + FILTER 6, h + jmp .end + +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .no_filter + + call mangle(private_prefix %+ _lpf_h_sb_y_16bpc_avx2).h4 + jmp .end + +.no_filter: + lea dstq, [dstq+strideq*8] + lea dstq, [dstq+strideq*8] +.end: + pslld m12, 4 + lea lq, [lq+l_strideq*4] + shl mask_bitsd, 4 + sub hd, 4 + jg .loop + RET + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/loopfilter16_avx512.asm b/third_party/dav1d/src/x86/loopfilter16_avx512.asm new file mode 100644 index 0000000000..b7bc3aa106 --- /dev/null +++ b/third_party/dav1d/src/x86/loopfilter16_avx512.asm @@ -0,0 +1,912 @@ +; Copyright © 2022, VideoLAN and dav1d authors +; Copyright © 2022, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
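+
+; 16bpc loop filter, AVX512ICL. Same overall structure as the AVX2
+; version, but each zmm register holds 32 pixels, the h (vertical-edge)
+; paths build their 8x8-word transposes across four 128-bit lanes, and
+; the fm/flat8/flat16/hev conditions are kept in opmask registers
+; (k1-k4), so filtered outputs are merged with {k} write-masking rather
+; than vector blends.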
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +l_shuf_v: times 2 db 0, 32 +pw_1: times 2 dw 1 + times 2 db 4, 36 +pw_3: times 2 dw 3 + times 2 db 8, 40 +pw_4: times 2 dw 4 + times 2 db 12, 44 +pw_16: times 2 dw 16 + times 2 db 16, 48 +pw_4096: times 2 dw 4096 + times 2 db 20, 52 +pw_16384: times 2 dw 16384 + times 2 db 24, 56 +pw_32767: times 2 dw 32767 + times 2 db 28, 60 + times 2 dw 0 +filter_mask: dd 1, 2, 4, 8, 16, 32, 64,128 +stride_mul: dd 0, 1, 8, 9, 16, 17, 24, 25 +l_shuf_h: db 4, -1, 4, -1, 4, -1, 4, -1, 12, -1, 12, -1, 12, -1, 12, -1 +clip_max: dw 511, 511, 2047, 2047 +clip_min: dw -512, -512, -2048, -2048 + +SECTION .text + +%macro TRANSPOSE8X8W 9 ; src/dst[1-8], tmp + punpckhwd m%9, m%5, m%6 + punpcklwd m%5, m%6 + punpckhwd m%6, m%1, m%2 + punpcklwd m%1, m%2 + punpckhwd m%2, m%7, m%8 + punpcklwd m%7, m%8 + punpckhwd m%8, m%3, m%4 + punpcklwd m%3, m%4 + punpckhdq m%4, m%1, m%3 + punpckldq m%1, m%3 + punpckldq m%3, m%5, m%7 + punpckhdq m%5, m%7 + punpckhdq m%7, m%6, m%8 + punpckldq m%6, m%8 + punpckldq m%8, m%9, m%2 + punpckhdq m%9, m%2 + punpckhqdq m%2, m%1, m%3 + punpcklqdq m%1, m%3 + punpcklqdq m%3, m%4, m%5 + punpckhqdq m%4, m%5 + punpcklqdq m%5, m%6, m%8 + punpckhqdq m%6, m%8 + punpckhqdq m%8, m%7, m%9 + punpcklqdq m%7, m%9 +%endmacro + +%macro FILTER 2 ; width [4/6/8/16], dir [h/v] +%ifidn %2, v +%if %1 == 16 + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1 ] + mova m1, [tmpq+strideq*2 ] ; p5 + mova m2, [tmpq+stride3q ] ; p4 + mova m3, [tmpq+strideq*4 ] ; p3 + mova m4, [tmpq+stride5q ] ; p2 +%elif %1 == 6 || %1 == 8 + lea tmpq, [dstq+mstrideq*4] +%if %1 == 8 + mova m3, [tmpq+strideq*0 ] +%endif + mova m4, [tmpq+strideq*1 ] +%endif + mova m5, [dstq+mstrideq*2] ; p1 + mova m6, [dstq+mstrideq*1] ; p0 + mova m7, [dstq+strideq*0 ] ; q0 + mova m8, [dstq+strideq*1 ] ; q1 +%if %1 != 4 + mova m9, [dstq+strideq*2 ] ; q2 +%endif +%if %1 == 8 || %1 == 16 + mova m10, [dstq+stride3q ] ; q3 +%endif +%if %1 == 16 + mova m11, [dstq+strideq*4 ] ; q4 + mova m22, [dstq+stride5q ] ; q5 + mova m23, [dstq+stride3q*2] +%endif +%else ; h +%if %1 == 16 + movu ym16, [dstq+strideq*0 -16] + movu ym17, [dstq+strideq*1 -16] + movu ym18, [dstq+strideq*2 -16] + movu ym19, [dstq+stride3q -16] + movu ym20, [dstq+strideq*4 -16] + movu ym22, [dstq+stride5q -16] + movu ym23, [dstq+stride3q*2-16] + movu ym28, [dstq+stride7q -16] + lea tmpq, [dstq+strideq*8 -16] + vinserti32x8 m7, m16, [tmpq+strideq*0 ], 1 + vinserti32x8 m8, m17, [tmpq+strideq*1 ], 1 + vinserti32x8 m9, m18, [tmpq+strideq*2 ], 1 + vinserti32x8 m10, m19, [tmpq+stride3q ], 1 + vinserti32x8 m11, m20, [tmpq+strideq*4 ], 1 + vinserti32x8 m22, m22, [tmpq+stride5q ], 1 + vinserti32x8 m23, m23, [tmpq+stride3q*2], 1 + vinserti32x8 m28, m28, [tmpq+stride7q ], 1 + lea tmpq, [tmpq+strideq*8] + TRANSPOSE8X8W 7, 8, 9, 10, 11, 22, 23, 28, 27 + movu ym16, [tmpq+strideq*0 ] + movu ym17, [tmpq+strideq*1 ] + movu ym18, [tmpq+strideq*2 ] + movu ym19, [tmpq+stride3q ] + movu ym24, [tmpq+strideq*4 ] + movu ym25, [tmpq+stride5q ] + movu ym26, [tmpq+stride3q*2] + movu ym20, [tmpq+stride7q ] + lea tmpq, [tmpq+strideq*8] + vinserti32x8 m0, m16, [tmpq+strideq*0 ], 1 + vinserti32x8 m1, m17, [tmpq+strideq*1 ], 1 + vinserti32x8 m2, m18, [tmpq+strideq*2 ], 1 + vinserti32x8 m3, m19, [tmpq+stride3q ], 1 + vinserti32x8 m4, m24, [tmpq+strideq*4 ], 1 + vinserti32x8 m5, m25, [tmpq+stride5q ], 1 + vinserti32x8 m6, m26, [tmpq+stride3q*2], 1 + vinserti32x8 m20, m20, [tmpq+stride7q ], 1 + TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 20, 
27 + vshufi32x4 m27, m7, m0, q2020 + vshufi32x4 m7, m0, q3131 + vshufi32x4 m0, m8, m1, q2020 + vshufi32x4 m8, m1, q3131 + vshufi32x4 m1, m9, m2, q2020 + vshufi32x4 m9, m2, q3131 + vshufi32x4 m2, m10, m3, q2020 + vshufi32x4 m10, m3, q3131 + vshufi32x4 m3, m11, m4, q2020 + vshufi32x4 m11, m4, q3131 + vshufi32x4 m4, m22, m5, q2020 + vshufi32x4 m22, m5, q3131 + vshufi32x4 m5, m23, m6, q2020 + vshufi32x4 m23, m6, q3131 + vshufi32x4 m6, m28, m20, q2020 + vshufi32x4 m28, m20, q3131 +%elif %1 == 6 || %1 == 8 +%if %1 == 8 + sub dstq, 8 + movu xm16, [dstq+strideq*0 ] + movu xm17, [dstq+strideq*1 ] + movu xm18, [dstq+strideq*2 ] + movu xm19, [dstq+stride3q ] + movu xm24, [dstq+strideq*4 ] + movu xm25, [dstq+stride5q ] + movu xm26, [dstq+stride3q*2] + movu xm27, [dstq+stride7q ] + lea tmpq, [dstq+strideq*8 ] + vinserti128 ym16, [tmpq+strideq*0 ], 1 + vinserti128 ym17, [tmpq+strideq*1 ], 1 + vinserti128 ym18, [tmpq+strideq*2 ], 1 + vinserti128 ym19, [tmpq+stride3q ], 1 + vinserti128 ym24, [tmpq+strideq*4 ], 1 + vinserti128 ym25, [tmpq+stride5q ], 1 + vinserti128 ym26, [tmpq+stride3q*2], 1 + vinserti128 ym27, [tmpq+stride7q ], 1 + lea tmpq, [tmpq+strideq*8 ] + vinserti32x4 m10, m16, [tmpq+strideq*0 ], 2 + vinserti32x4 m8, m17, [tmpq+strideq*1 ], 2 + vinserti32x4 m5, m18, [tmpq+strideq*2 ], 2 + vinserti32x4 m7, m19, [tmpq+stride3q ], 2 + vinserti32x4 m2, m24, [tmpq+strideq*4 ], 2 + vinserti32x4 m9, m25, [tmpq+stride5q ], 2 + vinserti32x4 m3, m26, [tmpq+stride3q*2], 2 + vinserti32x4 m4, m27, [tmpq+stride7q ], 2 + lea tmpq, [tmpq+strideq*8 ] + vinserti32x4 m10, [tmpq+strideq*0 ], 3 + vinserti32x4 m8, [tmpq+strideq*1 ], 3 + vinserti32x4 m5, [tmpq+strideq*2 ], 3 + vinserti32x4 m7, [tmpq+stride3q ], 3 + vinserti32x4 m2, [tmpq+strideq*4 ], 3 + vinserti32x4 m9, [tmpq+stride5q ], 3 + vinserti32x4 m3, [tmpq+stride3q*2], 3 + vinserti32x4 m4, [tmpq+stride7q ], 3 +%else ; %1 == 6 + movu xm16, [dstq+strideq*0-8] + movu xm17, [dstq+strideq*1-8] + movu xm18, [dstq+strideq*2-8] + movu xm19, [dstq+stride3q -8] + lea tmpq, [dstq+strideq*4-8] + movu xm2, [tmpq+strideq*0] + movu xm9, [tmpq+strideq*1] + movu xm3, [tmpq+strideq*2] + movu xm4, [tmpq+stride3q ] + lea tmpq, [tmpq+strideq*4] + vinserti128 ym16, [tmpq+strideq*0], 1 + vinserti128 ym17, [tmpq+strideq*1], 1 + vinserti128 ym18, [tmpq+strideq*2], 1 + vinserti128 ym19, [tmpq+stride3q ], 1 + lea tmpq, [tmpq+strideq*4] + vinserti128 ym2, [tmpq+strideq*0], 1 + vinserti128 ym9, [tmpq+strideq*1], 1 + vinserti128 ym3, [tmpq+strideq*2], 1 + vinserti128 ym4, [tmpq+stride3q ], 1 + lea tmpq, [tmpq+strideq*4] + vinserti32x4 m10, m16, [tmpq+strideq*0], 2 + vinserti32x4 m8, m17, [tmpq+strideq*1], 2 + vinserti32x4 m5, m18, [tmpq+strideq*2], 2 + vinserti32x4 m7, m19, [tmpq+stride3q ], 2 + lea tmpq, [tmpq+strideq*4] + vinserti32x4 m2, [tmpq+strideq*0], 2 + vinserti32x4 m9, [tmpq+strideq*1], 2 + vinserti32x4 m3, [tmpq+strideq*2], 2 + vinserti32x4 m4, [tmpq+stride3q ], 2 + lea tmpq, [tmpq+strideq*4] + vinserti32x4 m10, [tmpq+strideq*0], 3 + vinserti32x4 m8, [tmpq+strideq*1], 3 + vinserti32x4 m5, [tmpq+strideq*2], 3 + vinserti32x4 m7, [tmpq+stride3q ], 3 + lea tmpq, [tmpq+strideq*4] + vinserti32x4 m2, [tmpq+strideq*0], 3 + vinserti32x4 m9, [tmpq+strideq*1], 3 + vinserti32x4 m3, [tmpq+strideq*2], 3 + vinserti32x4 m4, [tmpq+stride3q ], 3 +%endif + punpcklwd m6, m10, m8 + punpckhwd m10, m8 + punpcklwd m8, m5, m7 + punpckhwd m5, m7 + punpcklwd m7, m2, m9 + punpckhwd m2, m9 + punpcklwd m9, m3, m4 + punpckhwd m3, m4 + punpckldq m4, m6, m8 + punpckhdq m6, m8 + punpckldq m8, m10, m5 + punpckhdq 
m10, m5 + punpckldq m5, m7, m9 + punpckhdq m7, m9 + punpckldq m9, m2, m3 + punpckhdq m2, m3 +%if %1 == 8 + punpcklqdq m3, m4, m5 +%endif + punpckhqdq m4, m5 + punpcklqdq m5, m6, m7 + punpckhqdq m6, m7 + punpcklqdq m7, m8, m9 + punpckhqdq m8, m9 + punpcklqdq m9, m10, m2 +%if %1 == 8 + punpckhqdq m10, m2 +%endif +%else ; %1 == 4 + kxnorb k1, k1, k1 + kmovb k2, k1 + vpgatherdq m7{k1}, [dstq+ym12-4] + lea tmpq, [dstq+strideq*2-4] + kmovb k1, k2 + vpgatherdq m4{k2}, [tmpq+ym12] + lea tmpq, [tmpq+strideq*2] + kmovb k2, k1 + vpgatherdq m5{k1}, [tmpq+ym12] + lea tmpq, [tmpq+strideq*2] + vpgatherdq m6{k2}, [tmpq+ym12] + punpcklwd m8, m7, m4 + punpckhwd m7, m4 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpcklwd m6, m8, m7 + punpckhwd m8, m7 + punpcklwd m7, m4, m5 + punpckhwd m4, m5 + punpcklqdq m5, m6, m7 + punpckhqdq m6, m7 + punpcklqdq m7, m8, m4 + punpckhqdq m8, m4 +%endif +%endif + + ; load L/E/I/H +%ifidn %2, v + movu ym16, [lq+l_strideq*1] + movsldup m17, [l_shuf_v] + vptestnmb k1, ym16, ym16 + vmovdqu8 ym16{k1}, [lq+l_strideq*0] ; l[x][] ? l[x][] : l[x-stride][] + vpermb m16, m17, m16 ; l[x][1] +%else + movq xm16, [lq+l_strideq*0] + movq xm17, [lq+l_strideq*1] + vinserti128 ym16, [lq+l_strideq*2], 1 + vinserti128 ym17, [lq+l_stride3q ], 1 + lea tmpq, [lq+l_strideq*4] + vinserti32x4 m16, [tmpq+l_strideq*0], 2 + vinserti32x4 m17, [tmpq+l_strideq*1], 2 + vinserti32x4 m16, [tmpq+l_strideq*2], 3 + vinserti32x4 m17, [tmpq+l_stride3q ], 3 + punpcklqdq m16, m17 + vbroadcasti32x4 m17, [l_shuf_h] + vptestnmb k1, m16, m16 + vpalignr m16{k1}, m16, 12 + pshufb m16, m17 ; l[x][1] +%endif + vpbroadcastd m20, [pw_32767] + psubw m17, m5, m6 ; p1-p0 + psubw m18, m7, m8 ; q1-q0 + vptestmw k1, m16, m16 ; L + pabsw m17, m17 + pabsw m18, m18 + vpmaxuw m20{k1}, m17, m18 + vpbroadcastw m17, [lutq+136] + psrlw m18, m16, [lutq+128] + vpbroadcastd m19, [pw_1] + pminuw m18, m17 + psrlw m17, m16, 4 ; H + paddw m16, m16 + pmaxuw m18, m19 ; I + vpaddd m16, [pw_4] {1to16} + paddw m16, m18 ; E + REPX {pmullw x, m13}, m17, m18, m16 + vpcmpw k4, m20, m17, 6 ; hev +%if %1 != 4 + psubw m19, m4, m5 ; p2-p1 + pabsw m19, m19 +%if %1 == 8 || %1 == 16 + psubw m17, m3, m4 ; p3-p2 + pabsw m17, m17 + pmaxuw m19, m17 + psubw m17, m9, m10 ; q3-q2 + pabsw m17, m17 + pmaxuw m19, m17 +%endif + psubw m17, m9, m8 ; q2-q1 + pabsw m17, m17 + pmaxuw m19, m17 +%if %1 == 16 + vpbroadcastd ym17, [maskq+4] + vpord ym17, [maskq+8] {1to8} + vptestmd k1, ym17, ym21 +%else + vptestmd k1, ym21, [maskq+4] {1to8} +%endif + pmaxuw m19, m20 + psubw m17, m4, m6 ; p2-p0 + pabsw m17, m17 + pmaxuw m17, m20 + vmovdqa64 m20{k1}, m19 ; only apply fm-wide to wd>4 blocks +%if %1 == 8 || %1 == 16 + psubw m19, m3, m6 ; p3-p0 + pabsw m19, m19 + pmaxuw m17, m19 + psubw m19, m7, m10 ; q3-q0 + pabsw m19, m19 + pmaxuw m17, m19 +%endif + psubw m19, m7, m9 ; q2-q0 + pabsw m19, m19 + pmaxuw m17, m19 +%endif + vpcmpw k1, m20, m18, 2 + psubw m18, m5, m8 ; p1-q1 + psubw m19, m6, m7 ; p0-q0 + pabsw m18, m18 + pabsw m19, m19 + psrlw m18, 1 + paddw m19, m19 + paddw m18, m19 ; abs(p0-q0)*2+(abs(p1-q1)>>1) + vpcmpw k1{k1}, m18, m16, 2 ; abs(p0-q0)*2+(abs(p1-q1)>>1) <= E +%if %1 != 4 + vpcmpw k2{k1}, m17, m13, 2 ; flat8in +%endif +%if %1 == 16 + psubw m20, m0, m6 + psubw m16, m1, m6 + pabsw m20, m20 + psubw m17, m2, m6 + pabsw m16, m16 + psubw m18, m11, m7 + pabsw m17, m17 + psubw m19, m22, m7 + pabsw m18, m18 + pmaxuw m20, m16 + psubw m16, m23, m7 + pabsw m19, m19 + pmaxuw m17, m18 + pabsw m16, m16 + vpandd ym18, ym21, [maskq+8] {1to8} + pmaxuw m20, m17 + pmaxuw m19, m16 + pcmpeqd 
ym16, ym21, ym18 + vpternlogd ym18, ym21, [maskq+4] {1to8}, 0xc8 + pmaxuw m20, m19 + pcmpeqd ym17, ym21, ym18 + vpternlogd ym18, ym21, [maskq+0] {1to8}, 0xc8 + vpcmpw k3{k2}, m20, m13, 2 ; flat8in & flat8out + pcmpeqd ym18, ym21 + vptestmb k3{k3}, ym16, ym16 ; flat8 & fm + vptestmb k2{k2}, ym17, ym17 ; flat8in + vptestmb k1{k1}, ym18, ym18 + kandnd k1, k2, k1 ; fm & !flat8 & !flat16 + kandnd k2, k3, k2 ; flat8 & !flat16 +%elif %1 == 6 || %1 == 8 + vpandd ym17, ym21, [maskq+4] {1to8} + pcmpeqd ym16, ym21, ym17 + vpternlogd ym17, ym21, [maskq+0] {1to8}, 0xc8 + pcmpeqd ym17, ym21 + vptestmb k2{k2}, ym16, ym16 ; flat8 & fm + vptestmb k1{k1}, ym17, ym17 + kandnd k1, k2, k1 ; fm & !flat8 +%else ; %1 == 4 + vpandd ym16, ym21, [maskq+0] {1to8} + pcmpeqd ym16, ym21 + vptestmb k1{k1}, ym16, ym16 +%endif + + ; short filter + psubw m16, m7, m6 + vpbroadcastd m17, [pw_3] + paddw m18, m16, m16 + paddw m18, m16 + psubw m16, m5, m8 ; iclip_diff(p1-q1) + pminsw m16, m14 + vpmaxsw m16{k4}{z}, m15 ; f=iclip_diff(p1-q1)&hev + knotd k4, k4 ; !hev + paddw m16, m18 ; f=iclip_diff(3*(q0-p0)+f) + vpbroadcastd m18, [pw_4] + pminsw m16, m14 + vpmaxsw m16{k1}{z}, m15 ; f&=fm + paddw m17, m16 + paddw m16, m18 + vpbroadcastd m18, [pw_16384] + pminsw m17, m14 + pminsw m16, m14 + psraw m17, 3 ; f2 + psraw m16, 3 ; f1 + paddw m6, m17 + psubw m7, m16 + vpmulhrsw m16{k4}{z}, m18 ; (f=(f1+1)>>1) & !hev + psubw m17, m14, m15 ; 1023 or 4095 + pxor m18, m18 + paddw m5, m16 + psubw m8, m16 + REPX {pminsw x, m17}, m6, m7, m5, m8 + REPX {pmaxsw x, m18}, m6, m7, m5, m8 + +%if %1 == 16 ; flat16 filter + vpaddd m19, m0, [pw_1] {1to16} + paddw m16, m1, m2 ; p5+p4 + paddw m26, m1, m6 ; p5+p0 + paddw m24, m2, m7 ; p4+q0 + paddw m16, m4 ; p5+p4+p3 + paddw m17, m3, m5 ; p2+p1 + psllw m19, 3 + paddw m16, m26 ; p5*2+p4+p3+p0 + paddw m17, m24 ; p4+p2+p1+q0 + psubw m19, m0 ; p6*7+8 + paddw m16, m17 ; p5*2+p4*2+p3+p2+p1+q0 + paddw m18, m3, m8 + paddw m19, m16 ; p6*7+p5+p4*2+p3+p2+p1+p0+q0 + paddw m25, m1, m0 + paddw m16, m0, m0 + psrlw m1{k3}, m19, 4 + paddw m19, m18 + psubw m19, m16 ; +p3+q1-p6*2 + paddw m16, m2, m0 + psrlw m2{k3}, m19, 4 + psubw m19, m25 + paddw m25, m4, m9 + paddw m20, m10, m5 + paddw m19, m25 ; +p2+q2-p6-p5 + paddw m17, m0, m3 + psubw m16, m20, m16 + psrlw m3{k3}, m19, 4 + paddw m19, m16 ; +p1+q3-p6-p4 + paddw m16, m11, m6 + psubw m16, m17 + paddw m17, m0, m4 + psrlw m4{k3}, m19, 4 + paddw m19, m16 ; +p0+q4-p6-p3 + paddw m16, m22, m7 + psubw m16, m17 + paddw m17, m0, m5 + psrlw m5{k3}, m19, 4 + paddw m19, m16 ; +q0+q5-p6-p2 + paddw m16, m23, m8 + psrlw m6{k3}, m19, 4 + psubw m16, m17 + paddw m19, m16 ; +q1+q6-p6-p1 + paddw m16, m23, m9 + psrlw m7{k3}, m19, 4 + psubw m16, m26 + paddw m19, m16 ; +q2+q6-p5-p0 + paddw m16, m23, m10 + psrlw m8{k3}, m19, 4 + psubw m16, m24 + paddw m19, m16 ; +q3+q6-p4-p0 + paddw m16, m23, m11 + psrlw m9{k3}, m19, 4 + psubw m16, m18 + paddw m19, m16 ; +q4+q6-p3-q1 + paddw m16, m23, m22 + psrlw m10{k3}, m19, 4 + psubw m16, m25 + paddw m19, m16 ; +q5+q6-p2-q2 + paddw m16, m23, m23 + psrlw m11{k3}, m19, 4 + psubw m16, m20 + paddw m19, m16 ; +q6*2-p1-q3 + psrlw m22{k3}, m19, 4 +%endif +%if %1 == 8 || %1 == 16 ; flat8 filter + vpbroadcastd m20, [pw_4096] + paddw m16, m3, m4 ; p3+p2 + paddw m19, m5, m6 ; p1+p0 + paddw m17, m16, m16 ; 2*(p3+p2) + paddw m19, m3 ; p1+p0+p3 + paddw m17, m7 ; 2*(p3+p2)+q0 + paddw m19, m17 ; 3*p3+2*p2+p1+p0+q0 + paddw m18, m4, m7 + pmulhrsw m4{k2}, m19, m20 + psubw m19, m16 + paddw m17, m5, m8 + paddw m16, m3, m5 + paddw m19, m17 + pmulhrsw m5{k2}, m19, m20 + psubw m19, m16 
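+ ; each flat8 output is (sum+4)>>3, computed as pmulhrsw with 4096
+ ; since (x*4096+(1<<14))>>15 == (x+4)>>3; the 8-tap sum in m19 is
+ ; updated incrementally: p3+p1 was just subtracted, and p0+q2 is
+ ; added next to form the p0 output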
+ paddw m16, m6, m9 + paddw m19, m16 + paddw m16, m3, m6 + pmulhrsw m6{k2}, m19, m20 + paddw m19, m10 + psubw m16, m7, m16 + paddw m19, m16 + psubw m16, m10, m18 + pmulhrsw m7{k2}, m19, m20 + paddw m16, m8 + paddw m19, m16 + psubw m16, m10, m17 + pmulhrsw m8{k2}, m19, m20 + paddw m16, m9 + paddw m19, m16 + pmulhrsw m9{k2}, m19, m20 +%elif %1 == 6 ; flat6 filter + vpbroadcastd m10, [pw_4096] + paddw m2, m5, m6 + paddw m0, m4, m7 + paddw m1, m2, m4 ; p2+p1+p0 + paddw m3, m4, m4 + paddw m1, m1 + paddw m4, m5 + paddw m1, m0 ; p2+2*(p2+p1+p0)+q0 + psubw m3, m7, m3 + pmulhrsw m5{k2}, m1, m10 + paddw m3, m8 + psubw m4, m8, m4 + paddw m1, m3 + pmulhrsw m6{k2}, m1, m10 + paddw m4, m9 + paddw m9, m9 + paddw m1, m4 + pmulhrsw m7{k2}, m1, m10 + psubw m9, m2 + paddw m1, m9 + pmulhrsw m8{k2}, m1, m10 +%endif + +%ifidn %2, v +%if %1 == 16 + mova [tmpq+strideq*2 ], m1 ; p5 + mova [tmpq+stride3q ], m2 ; p4 + mova [tmpq+strideq*4 ], m3 ; p3 + mova [tmpq+stride5q ], m4 ; p2 +%elif %1 == 8 + mova [tmpq+strideq*1 ], m4 ; p2 +%endif + mova [dstq+mstrideq*2], m5 ; p1 + mova [dstq+mstrideq ], m6 ; p0 + mova [dstq+strideq*0 ], m7 ; q0 + mova [dstq+strideq*1 ], m8 ; q1 +%if %1 == 8 || %1 == 16 + mova [dstq+strideq*2 ], m9 ; q2 +%endif +%if %1 == 16 + mova [dstq+stride3q ], m10 ; q3 + mova [dstq+strideq*4 ], m11 ; q4 + mova [dstq+stride5q ], m22 ; q5 +%endif +%else +%if %1 == 16 + TRANSPOSE8X8W 27, 0, 1, 2, 3, 4, 5, 6, 20 + TRANSPOSE8X8W 7, 8, 9, 10, 11, 22, 23, 28, 20 + mova [dstq+strideq*0 -16], xm27 + mova [dstq+strideq*0 ], xm7 + mova [dstq+strideq*1 -16], xm0 + mova [dstq+strideq*1 ], xm8 + mova [dstq+strideq*2 -16], xm1 + mova [dstq+strideq*2 ], xm9 + mova [dstq+stride3q -16], xm2 + mova [dstq+stride3q ], xm10 + mova [dstq+strideq*4 -16], xm3 + mova [dstq+strideq*4 ], xm11 + mova [dstq+stride5q -16], xm4 + mova [dstq+stride5q ], xm22 + mova [dstq+stride3q*2-16], xm5 + mova [dstq+stride3q*2 ], xm23 + mova [dstq+stride7q -16], xm6 + mova [dstq+stride7q ], xm28 + lea dstq, [dstq+strideq*8] + vextracti128 [dstq+strideq*0 -16], ym27, 1 + vextracti128 [dstq+strideq*0 ], ym7, 1 + vextracti128 [dstq+strideq*1 -16], ym0, 1 + vextracti128 [dstq+strideq*1 ], ym8, 1 + vextracti128 [dstq+strideq*2 -16], ym1, 1 + vextracti128 [dstq+strideq*2 ], ym9, 1 + vextracti128 [dstq+stride3q -16], ym2, 1 + vextracti128 [dstq+stride3q ], ym10, 1 + vextracti128 [dstq+strideq*4 -16], ym3, 1 + vextracti128 [dstq+strideq*4 ], ym11, 1 + vextracti128 [dstq+stride5q -16], ym4, 1 + vextracti128 [dstq+stride5q ], ym22, 1 + vextracti128 [dstq+stride3q*2-16], ym5, 1 + vextracti128 [dstq+stride3q*2 ], ym23, 1 + vextracti128 [dstq+stride7q -16], ym6, 1 + vextracti128 [dstq+stride7q ], ym28, 1 + lea dstq, [dstq+strideq*8] + vextracti32x4 [dstq+strideq*0 -16], m27, 2 + vextracti32x4 [dstq+strideq*0 ], m7, 2 + vextracti32x4 [dstq+strideq*1 -16], m0, 2 + vextracti32x4 [dstq+strideq*1 ], m8, 2 + vextracti32x4 [dstq+strideq*2 -16], m1, 2 + vextracti32x4 [dstq+strideq*2 ], m9, 2 + vextracti32x4 [dstq+stride3q -16], m2, 2 + vextracti32x4 [dstq+stride3q ], m10, 2 + vextracti32x4 [dstq+strideq*4 -16], m3, 2 + vextracti32x4 [dstq+strideq*4 ], m11, 2 + vextracti32x4 [dstq+stride5q -16], m4, 2 + vextracti32x4 [dstq+stride5q ], m22, 2 + vextracti32x4 [dstq+stride3q*2-16], m5, 2 + vextracti32x4 [dstq+stride3q*2 ], m23, 2 + vextracti32x4 [dstq+stride7q -16], m6, 2 + vextracti32x4 [dstq+stride7q ], m28, 2 + lea dstq, [dstq+strideq*8] + vextracti32x4 [dstq+strideq*0 -16], m27, 3 + vextracti32x4 [dstq+strideq*0 ], m7, 3 + vextracti32x4 [dstq+strideq*1 -16], m0, 3 + 
vextracti32x4 [dstq+strideq*1 ], m8, 3 + vextracti32x4 [dstq+strideq*2 -16], m1, 3 + vextracti32x4 [dstq+strideq*2 ], m9, 3 + vextracti32x4 [dstq+stride3q -16], m2, 3 + vextracti32x4 [dstq+stride3q ], m10, 3 + vextracti32x4 [dstq+strideq*4 -16], m3, 3 + vextracti32x4 [dstq+strideq*4 ], m11, 3 + vextracti32x4 [dstq+stride5q -16], m4, 3 + vextracti32x4 [dstq+stride5q ], m22, 3 + vextracti32x4 [dstq+stride3q*2-16], m5, 3 + vextracti32x4 [dstq+stride3q*2 ], m23, 3 + vextracti32x4 [dstq+stride7q -16], m6, 3 + vextracti32x4 [dstq+stride7q ], m28, 3 +%elif %1 == 8 + TRANSPOSE8X8W 3, 4, 5, 6, 7, 8, 9, 10, 2 + movu [dstq+strideq*0 ], xm3 + movu [dstq+strideq*1 ], xm4 + movu [dstq+strideq*2 ], xm5 + movu [dstq+stride3q ], xm6 + movu [dstq+strideq*4 ], xm7 + movu [dstq+stride5q ], xm8 + movu [dstq+stride3q*2], xm9 + movu [dstq+stride7q ], xm10 + lea dstq, [dstq+strideq*8] + vextracti128 [dstq+strideq*0 ], ym3, 1 + vextracti128 [dstq+strideq*1 ], ym4, 1 + vextracti128 [dstq+strideq*2 ], ym5, 1 + vextracti128 [dstq+stride3q ], ym6, 1 + vextracti128 [dstq+strideq*4 ], ym7, 1 + vextracti128 [dstq+stride5q ], ym8, 1 + vextracti128 [dstq+stride3q*2], ym9, 1 + vextracti128 [dstq+stride7q ], ym10, 1 + lea dstq, [dstq+strideq*8] + vextracti32x4 [dstq+strideq*0 ], m3, 2 + vextracti32x4 [dstq+strideq*1 ], m4, 2 + vextracti32x4 [dstq+strideq*2 ], m5, 2 + vextracti32x4 [dstq+stride3q ], m6, 2 + vextracti32x4 [dstq+strideq*4 ], m7, 2 + vextracti32x4 [dstq+stride5q ], m8, 2 + vextracti32x4 [dstq+stride3q*2], m9, 2 + vextracti32x4 [dstq+stride7q ], m10, 2 + lea dstq, [dstq+strideq*8] + vextracti32x4 [dstq+strideq*0 ], m3, 3 + vextracti32x4 [dstq+strideq*1 ], m4, 3 + vextracti32x4 [dstq+strideq*2 ], m5, 3 + vextracti32x4 [dstq+stride3q ], m6, 3 + vextracti32x4 [dstq+strideq*4 ], m7, 3 + vextracti32x4 [dstq+stride5q ], m8, 3 + vextracti32x4 [dstq+stride3q*2], m9, 3 + vextracti32x4 [dstq+stride7q ], m10, 3 + lea dstq, [dstq+strideq*8+8] +%else ; %1 == 4 || %1 == 6 + punpcklwd m9, m5, m6 + punpckhwd m5, m6 + kxnorb k1, k1, k1 + punpcklwd m6, m7, m8 + punpckhwd m7, m8 + kmovb k2, k1 + punpckldq m8, m9, m6 + vpscatterdq [dstq+ym12-4]{k1}, m8 + punpckhdq m9, m6 + lea tmpq, [dstq+strideq*2-4] + kmovb k1, k2 + vpscatterdq [tmpq+ym12]{k2}, m9 + punpckldq m6, m5, m7 + lea tmpq, [tmpq+strideq*2] + kmovb k2, k1 + vpscatterdq [tmpq+ym12]{k1}, m6 + punpckhdq m5, m7 + lea tmpq, [tmpq+strideq*2] + vpscatterdq [tmpq+ym12]{k2}, m5 +%endif +%endif +%endmacro + +INIT_ZMM avx512icl +cglobal lpf_v_sb_y_16bpc, 6, 12, 26, dst, stride, mask, l, l_stride, \ + lut, w, stride3, mstride, tmp, \ + mask_bits, stride5 +%define base tmpq-filter_mask + SWAP 12, 26 ; avoids clobbering xmm10 on WIN64 + lea tmpq, [filter_mask] + mov r6d, r7m ; bitdepth_max + lea stride3q, [strideq*3] + shl l_strideq, 2 + lea stride5q, [strideq*5] + shr r6d, 11 ; is_12bpc + mova ym21, [base+filter_mask] + mov mstrideq, strideq + vpbroadcastd m13, [base+pw_4+r6*8] + mov mask_bitsd, 0xff + vpbroadcastd m14, [base+clip_max+r6*4] + sub lq, l_strideq + vpbroadcastd m15, [base+clip_min+r6*4] + neg mstrideq + mov wd, wm +.loop: + test [maskq+8], mask_bitsd ; vmask[2] + jz .no_flat16 + FILTER 16, v + jmp .end +.no_flat16: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + FILTER 8, v + jmp .end +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .end + call .v4 +.end: + shl mask_bitsd, 8 + add dstq, 64 + pslld ym21, 8 + add lq, 32 + sub wd, 8 + jg .loop + RET +ALIGN function_align +.v4: ; called by both luma and chroma + FILTER 4, v + ret + +cglobal lpf_h_sb_y_16bpc, 6, 
13, 29, dst, stride, mask, l, l_stride, \ + lut, h, stride3, l_stride3, tmp, \ + mask_bits, stride5, stride7 + lea tmpq, [filter_mask] + mov r6d, r7m ; bitdepth_max + lea stride3q, [strideq*3] + vpbroadcastd ym12, strided + shl l_strideq, 2 + lea stride5q, [strideq*5] + shr r6d, 11 ; is_12bpc + pmulld ym12, [base+stride_mul] + lea stride7q, [strideq+stride3q*2] + mova ym21, [base+filter_mask] + mov mask_bitsd, 0xff + vpbroadcastd m13, [base+pw_4+r6*8] + sub lq, 4 + vpbroadcastd m14, [base+clip_max+r6*4] + lea l_stride3q, [l_strideq*3] + vpbroadcastd m15, [base+clip_min+r6*4] + mov hd, hm +.loop: + test [maskq+8], mask_bitsd ; vmask[2] + jz .no_flat16 + FILTER 16, h + jmp .end +.no_flat16: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + FILTER 8, h + jmp .end2 +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .no_filter + call .h4 +.no_filter: + lea dstq, [dstq+stride3q*8] +.end: + lea dstq, [dstq+strideq*8] +.end2: + shl mask_bitsd, 8 + pslld ym21, 8 + lea lq, [lq+l_strideq*8] + sub hd, 8 + jg .loop + RET +ALIGN function_align +.h4: ; called by both luma and chroma + FILTER 4, h + ret + +cglobal lpf_v_sb_uv_16bpc, 6, 11, 22, dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp, mask_bits + lea tmpq, [filter_mask] + mov r6d, r7m ; bitdepth_max + shl l_strideq, 2 + lea stride3q, [strideq*3] + shr r6d, 11 ; is_12bpc + mova ym21, [base+filter_mask] + mov mstrideq, strideq + vpbroadcastd m13, [base+pw_4+r6*8] + mov mask_bitsd, 0xff + vpbroadcastd m14, [base+clip_max+r6*4] + sub lq, l_strideq + vpbroadcastd m15, [base+clip_min+r6*4] + neg mstrideq + mov wd, wm +.loop: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + FILTER 6, v + jmp .end +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .end + call mangle(private_prefix %+ _lpf_v_sb_y_16bpc_avx512icl).v4 +.end: + shl mask_bitsd, 8 + add dstq, 64 + pslld ym21, 8 + add lq, 32 + sub wd, 8 + jg .loop + RET + +cglobal lpf_h_sb_uv_16bpc, 6, 11, 22, dst, stride, mask, l, l_stride, lut, \ + h, stride3, l_stride3, tmp, mask_bits + lea tmpq, [filter_mask] + mov r6d, r7m ; bitdepth_max + vpbroadcastd ym12, strided + shl l_strideq, 2 + shr r6d, 11 ; is_12bpc + pmulld ym12, [base+stride_mul] + lea stride3q, [strideq*3] + mova ym21, [base+filter_mask] + mov mask_bitsd, 0xff + vpbroadcastd m13, [base+pw_4+r6*8] + sub lq, 4 + vpbroadcastd m14, [base+clip_max+r6*4] + lea l_stride3q, [l_strideq*3] + vpbroadcastd m15, [base+clip_min+r6*4] + mov hd, hm +.loop: + test [maskq+4], mask_bitsd ; vmask[1] + jz .no_flat + FILTER 6, h + jmp .end +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + jz .end + call mangle(private_prefix %+ _lpf_h_sb_y_16bpc_avx512icl).h4 +.end: + lea tmpq, [strideq+stride3q] + shl mask_bitsd, 8 + pslld ym21, 8 + lea dstq, [dstq+tmpq*8] + lea lq, [lq+l_strideq*8] + sub hd, 8 + jg .loop + RET + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/loopfilter16_sse.asm b/third_party/dav1d/src/x86/loopfilter16_sse.asm new file mode 100644 index 0000000000..c486b57a21 --- /dev/null +++ b/third_party/dav1d/src/x86/loopfilter16_sse.asm @@ -0,0 +1,1793 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. 
Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA 16 + +%if ARCH_X86_64 +%define PIC_sym(a) a +%else +%define PIC_base $$ +%define PIC_sym(a) pic_regq+a-PIC_base +%endif + +pb_4x1_4x5_4x9_4x13: times 4 db 0, 1 + times 4 db 8, 9 + +pw_1: times 8 dw 1 +pw_2: times 8 dw 2 +pw_3: times 8 dw 3 +; 4 and 16 need to be next to each other since they are used as alternates +; depending on whether bitdepth is 10 or 12 +pw_4: times 8 dw 4 +pw_16: times 8 dw 16 +pw_8: times 8 dw 8 +pw_4096: times 8 dw 4096 + +pb_mask: dd 1, 1, 2, 2 + +SECTION .text + +%if ARCH_X86_32 +%if STACK_ALIGNMENT < 16 +%define extra_stack 2 +%else +%define extra_stack 0 +%endif +%endif + +%macro RELOC_ARGS 2 ; h/v, off +ASSERT ARCH_X86_32 +%if STACK_ALIGNMENT < 16 + mov r5d, [rstk + stack_offset + 4*4 + 4] +%define lstridem [esp+%2+0*gprsize] + mov lstridem, r5d + mov r5d, [rstk + stack_offset + 4*5 + 4] +%define lutm [esp+%2+1*gprsize] + mov lutm, r5d + mov r5d, [rstk + stack_offset + 4*6 + 4] +%ifidn %1, v +%define wm [esp+%2+2*gprsize] + mov wm, r5d + mov r5d, [rstk + stack_offset + 4*3 + 4] +%define lm [esp+%2+3*gprsize] + mov lm, r5d +%else ; %1 == h +%define hm [esp+%2+2*gprsize] + mov hm, r5d +%endif ; %1==v + mov r5d, r7m +%define bdmulm [esp+%2+4*gprsize] + mov bdmulm, r5d +%else +%define lstridem r4m +%define lutm r5m +%ifidn %1, v +%define wm r6m +%define lm r3m +%else +%define hm r6m +%endif +%define bdmulm r7m +%endif ; STACK_ALIGNMENT +%endmacro + +%macro UNRELOC_ARGS 0 +%if ARCH_X86_32 +%undef lm +%undef lstridem +%undef wm +%undef hm +%undef lutm +%endif +%endmacro + +%macro SPLATD 2 + movd %1, %2 + pshufd %1, %1, q0000 +%endmacro + +%macro SPLATW 2 + movd %1, %2 + pshuflw %1, %1, q0000 + punpcklqdq %1, %1 +%endmacro + +; in: out: +; mm%1 a b c d a e i m +; mm%2 e f g h b f j n +; mm%3 i j k l -> c g k o +; mm%4 m n o p d h l p +%macro TRANSPOSE4X4W 5 + punpcklwd m%5, m%1, m%2 + punpckhwd m%1, m%2 + punpcklwd m%2, m%3, m%4 + punpckhwd m%3, m%4 + punpckldq m%4, m%5, m%2 + punpckhdq m%5, m%2 + punpckldq m%2, m%1, m%3 + punpckhdq m%1, m%3 + + SWAP %1, %4 + SWAP %2, %5, %3 +%endmacro + +; in: out: +; m%1 a b c d e f g h a i q y 6 E M U +; m%2 i j k l m n o p b j r z 7 F N V +; m%3 q r s t u v w x c k s 0 8 G O W +; m%4 y z 0 1 2 3 4 5 d l t 1 9 H P X +; m%5 6 7 8 9 A B C D -> e m u 2 A I Q Y +; m%6 E F G H I J K L f n v 3 B J R Z +; m%7 M N O P Q R S T g o w 4 C K S + +; m%8 U V W X Y Z + = h p x 5 D L T = +%if ARCH_X86_64 +%macro TRANSPOSE8X8W 9 + ; m%1 a b c d e f g h a i q y b j r z + ; m%2 i j k l m n o p c k s 
0 d l t 1 + ; m%3 q r s t u v w x -> e m u 2 f n v 3 + ; m%4 y z 0 1 2 3 4 5 g o w 4 h p x 5 + TRANSPOSE4X4W %1, %2, %3, %4, %9 + + ; m%5 6 7 8 9 A B C D 6 E M U 7 F N V + ; m%6 E F G H I J K L 8 G O W 9 H P X + ; m%7 M N O P Q R S T -> A I Q Y B J R Z + ; m%8 U V W X Y Z + = C K S + D L T = + TRANSPOSE4X4W %5, %6, %7, %8, %9 + + ; m%1 a i q y b j r z a i q y 6 E M U + ; m%2 c k s 0 d l t 1 b j r z 7 F N V + ; m%3 e m u 2 f n v 3 c k s 0 8 G O W + ; m%4 g o w 4 h p x 5 d l t 1 9 H P X + ; m%5 6 E M U 7 F N V -> e m u 2 A I Q Y + ; m%6 8 G O W 9 H P X f n v 3 B J R Z + ; m%7 A I Q Y B J R Z g o w 4 C K S + + ; m%8 C K S + D L T = h p x 5 D L T = + punpckhqdq m%9, m%1, m%5 + punpcklqdq m%1, m%5 + punpckhqdq m%5, m%2, m%6 + punpcklqdq m%2, m%6 + punpckhqdq m%6, m%3, m%7 + punpcklqdq m%3, m%7 + punpckhqdq m%7, m%4, m%8 + punpcklqdq m%4, m%8 + + SWAP %8, %7, %4, %5, %3, %2, %9 +%endmacro +%else ; x86-32 +; input: 1-7 in registers, 8 in first memory [read-only] +; second memory is scratch, and may overlap with first or third memory +; output: 1-5,7-8 in registers, 6 in third memory [write-only] +%macro TRANSPOSE8X8W 13 ; regs [8x], mem [3x], a/u [in/out alignment [2x] + TRANSPOSE4X4W %1, %2, %3, %4, %8 +%ifnidn %9, "" + mov%12 m%8, %9 +%else + mova m%8, %10 +%endif + mova %10, m%4 + TRANSPOSE4X4W %5, %6, %7, %8, %4 + punpckhqdq m%4, m%1, m%5 + punpcklqdq m%1, m%5 + punpckhqdq m%5, m%2, m%6 + punpcklqdq m%2, m%6 + punpckhqdq m%6, m%3, m%7 + punpcklqdq m%3, m%7 + mova m%7, %10 +%ifnidn %11, "" + mov%13 %11, m%6 +%else + mova %10, m%6 +%endif + punpckhqdq m%6, m%7, m%8 + punpcklqdq m%7, m%8 + + ; 1,4,2,5,3,8,7,6 -> 1,2,3,4,5,6,7,8 + SWAP %2, %4, %5, %3 + SWAP %6, %8 +%endmacro +%endif ; x86-32/64 + +; transpose and write m8-11, everything else is scratch +%macro TRANSPOSE_8x4_AND_WRITE_4x8 5 ; p1, p0, q0, q1, tmp + ; transpose 8x4 + punpcklwd %5, %1, %2 + punpckhwd %1, %2 + punpcklwd %2, %3, %4 + punpckhwd %3, %4 + punpckldq %4, %5, %2 + punpckhdq %5, %2 + punpckldq %2, %1, %3 + punpckhdq %1, %3 + + ; write out + movq [dstq+strideq*0-4], %4 + movhps [dstq+strideq*1-4], %4 + movq [dstq+strideq*2-4], %5 + movhps [dstq+stride3q -4], %5 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], %2 + movhps [dstq+strideq*1-4], %2 + movq [dstq+strideq*2-4], %1 + movhps [dstq+stride3q -4], %1 + lea dstq, [dstq+strideq*4] +%endmacro + +%macro FILTER 2 ; width [4/6/8/16], dir [h/v] + ; load data +%ifidn %2, v +%if %1 == 4 +%if ARCH_X86_64 +%define P1 m8 +%define P0 m9 +%define Q0 m10 +%define Q1 m11 + mova P1, [dstq+mstrideq*2] ; p1 + mova P0, [dstq+mstrideq*1] ; p0 + mova Q0, [dstq+strideq*0] ; q0 + mova Q1, [dstq+strideq*1] ; q1 +%else ; x86-32 +%define P1 [dstq+mstrideq*2] +%define P0 [dstq+mstrideq*1] +%define Q0 [dstq+strideq*0] +%define Q1 [dstq+strideq*1] +%endif ; x86-32/64 +%else ; %1 != 4 + ; load 6-8 pixels, remainder (for wd=16) will be read inline + lea tmpq, [dstq+mstrideq*4] +%if ARCH_X86_64 + ; we load p3 later +%define P2 m13 +%define P1 m8 +%define P0 m9 +%define Q0 m10 +%define Q1 m11 +%define Q2 m14 + mova P2, [tmpq+strideq*1] + mova P1, [tmpq+strideq*2] + mova P0, [tmpq+stride3q] + mova Q0, [dstq+strideq*0] + mova Q1, [dstq+strideq*1] + mova Q2, [dstq+strideq*2] +%if %1 != 6 +%define P3 [tmpq+strideq*0] +%define Q3 m15 + mova Q3, [dstq+stride3q] +%endif ; %1 != 6 +%else ; x86-32 +%define P2 [tmpq+strideq*1] +%define P1 [dstq+mstrideq*2] +%define P0 [dstq+mstrideq*1] +%define Q0 [dstq+strideq*0] +%define Q1 [dstq+strideq*1] +%define Q2 [dstq+strideq*2] +%if %1 != 6 +%define P3 
[dstq+mstrideq*4] +%define Q3 [dstq+stride3q] +%endif ; %1 != 6 +%endif ; x86-32/64 +%endif ; %1 ==/!= 4 +%else ; %2 != v + ; load lines +%if %1 == 4 + movq m0, [dstq+strideq*0-4] + movq m2, [dstq+strideq*1-4] + movq m4, [dstq+strideq*2-4] + movq m5, [dstq+stride3q -4] + lea tmpq, [dstq+strideq*4] + movq m3, [tmpq+strideq*0-4] + movq m6, [tmpq+strideq*1-4] + movq m1, [tmpq+strideq*2-4] + movq m7, [tmpq+stride3q -4] + + ; transpose 4x8 + ; m0: A-D0 + ; m2: A-D1 + ; m4: A-D2 + ; m5: A-D3 + ; m3: A-D4 + ; m6: A-D5 + ; m1: A-D6 + ; m7: A-D7 + punpcklwd m0, m2 + punpcklwd m4, m5 + punpcklwd m3, m6 + punpcklwd m1, m7 + ; m0: A0-1,B0-1,C0-1,D0-1 + ; m4: A2-3,B2-3,C2-3,D2-3 + ; m3: A4-5,B4-5,C4-5,D4-5 + ; m1: A6-7,B6-7,C6-7,D6-7 + punpckhdq m2, m0, m4 + punpckldq m0, m4 + punpckhdq m4, m3, m1 + punpckldq m3, m1 + ; m0: A0-3,B0-3 + ; m2: C0-3,D0-3 + ; m3: A4-7,B4-7 + ; m4: C4-7,D4-7 + punpckhqdq m1, m0, m3 + punpcklqdq m0, m3 + punpckhqdq m3, m2, m4 + punpcklqdq m2, m4 + ; m0: A0-7 + ; m1: B0-7 + ; m2: C0-7 + ; m3: D0-7 +%if ARCH_X86_64 + SWAP 0, 8 + SWAP 1, 9 + SWAP 2, 10 + SWAP 3, 11 +%define P1 m8 +%define P0 m9 +%define Q0 m10 +%define Q1 m11 +%else +%define P1 [esp+3*mmsize] +%define P0 [esp+4*mmsize] +%define Q0 [esp+5*mmsize] +%define Q1 [esp+6*mmsize] + mova P1, m0 + mova P0, m1 + mova Q0, m2 + mova Q1, m3 +%endif +%elif %1 == 6 || %1 == 8 + movu m0, [dstq+strideq*0-8] + movu m1, [dstq+strideq*1-8] + movu m2, [dstq+strideq*2-8] + movu m3, [dstq+stride3q -8] + lea tmpq, [dstq+strideq*4] + movu m4, [tmpq+strideq*0-8] + movu m5, [tmpq+strideq*1-8] + movu m6, [tmpq+strideq*2-8] +%if ARCH_X86_64 + movu m7, [tmpq+stride3q -8] +%endif + + ; transpose 8x16 + ; m0: A-H0,A-H8 + ; m1: A-H1,A-H9 + ; m2: A-H2,A-H10 + ; m3: A-H3,A-H11 + ; m4: A-H4,A-H12 + ; m5: A-H5,A-H13 + ; m6: A-H6,A-H14 + ; m7: A-H7,A-H15 +%if ARCH_X86_64 + punpcklwd m8, m0, m1 +%else + punpcklwd m7, m0, m1 +%endif + punpckhwd m0, m1 + punpcklwd m1, m2, m3 + punpckhwd m2, m3 + punpcklwd m3, m4, m5 + punpckhwd m4, m5 +%if ARCH_X86_64 + punpcklwd m5, m6, m7 + punpckhwd m6, m7 +%else + mova [rsp+3*16], m4 + movu m4, [tmpq+stride3q -8] + punpcklwd m5, m6, m4 + punpckhwd m6, m4 +%endif + ; m8: A0-1,B0-1,C0-1,D0-1 [m7 on x86-32] + ; m0: E0-1,F0-1,G0-1,H0-1 + ; m1: A2-3,B2-3,C2-3,D2-3 + ; m2: E2-3,F2-3,G2-3,H2-3 + ; m3: A4-5,B4-5,C4-5,D4-5 + ; m4: E4-5,F4-5,G4-5,H4-5 [r3 on x86-32] + ; m5: A6-7,B6-7,C6-7,D6-7 + ; m6: E6-7,F6-7,G6-7,H6-7 +%if ARCH_X86_64 + punpckldq m7, m8, m1 + punpckhdq m8, m1 +%else + punpckldq m4, m7, m1 + punpckhdq m7, m1 +%endif + punpckldq m1, m0, m2 + punpckhdq m0, m2 + punpckldq m2, m3, m5 + punpckhdq m3, m5 +%if ARCH_X86_64 + punpckldq m5, m4, m6 + punpckhdq m4, m6 +%else + mova [rsp+4*16], m3 + mova m3, [rsp+3*16] + punpckldq m5, m3, m6 + punpckhdq m3, m6 +%endif + ; m7: A0-3,B0-3 [m4 on x86-32] + ; m8: C0-3,D0-3 [m7 on x86-32] + ; m1: E0-3,F0-3 + ; m0: G0-3,H0-3 + ; m2: A4-7,B4-7 + ; m3: C4-7,D4-7 [r4 on x86-32] + ; m5: E4-7,F4-7 + ; m4: G4-7,H4-7 [m3 on x86-32] +%if ARCH_X86_64 +%if %1 != 6 + punpcklqdq m6, m7, m2 +%endif + punpckhqdq m7, m2 + punpcklqdq m2, m8, m3 + punpckhqdq m8, m3 + punpcklqdq m3, m1, m5 + punpckhqdq m1, m5 +%if %1 != 6 + punpckhqdq m5, m0, m4 +%endif + punpcklqdq m0, m4 +%if %1 == 8 + mova [rsp+1*16], m6 +%define P3 [rsp+1*16] +%endif + ; 7,2,8,3,1,0,5 -> 13,8,9,10,11,14,15 + SWAP 7, 13 + SWAP 8, 2, 9 + SWAP 3, 10 + SWAP 1, 11 + SWAP 0, 14 + SWAP 5, 15 +%define P2 m13 +%define P1 m8 +%define P0 m9 +%define Q0 m10 +%define Q1 m11 +%define Q2 m14 +%if %1 == 8 +%define Q3 m15 +%endif +%else ; 
x86-32 +%if %1 == 8 +%define P3 [rsp+ 6*16] + punpcklqdq m6, m4, m2 + mova P3, m6 +%endif + mova m6, [rsp+4*16] + punpckhqdq m4, m2 + punpcklqdq m2, m7, m6 + punpckhqdq m7, m6 + punpcklqdq m6, m1, m5 + punpckhqdq m1, m5 +%if %1 == 8 +%define Q3 [rsp+24*16] + punpckhqdq m5, m0, m3 + mova Q3, m5 +%endif + punpcklqdq m0, m3 +%if %1 == 8 +%define P2 [rsp+18*16] +%define P1 [rsp+19*16] +%define P0 [rsp+20*16] +%define Q0 [rsp+21*16] +%define Q1 [rsp+22*16] +%define Q2 [rsp+23*16] +%else +%define P2 [rsp+3*16] +%define P1 [rsp+4*16] +%define P0 [rsp+5*16] +%define Q0 [rsp+6*16] +%define Q1 [rsp+7*16] +%define Q2 [rsp+8*16] +%endif + mova P2, m4 + mova P1, m2 + mova P0, m7 + mova Q0, m6 + mova Q1, m1 + mova Q2, m0 +%endif ; x86-32/64 +%else ; %1 == 16 + ; We only use 14 pixels but we'll need the remainder at the end for + ; the second transpose + mova m0, [dstq+strideq*0-16] + mova m1, [dstq+strideq*1-16] + mova m2, [dstq+strideq*2-16] + mova m3, [dstq+stride3q -16] + lea tmpq, [dstq+strideq*4] + mova m4, [tmpq+strideq*0-16] + mova m5, [tmpq+strideq*1-16] + mova m6, [tmpq+strideq*2-16] +%if ARCH_X86_64 + mova m7, [tmpq+stride3q -16] + + TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 8 + SWAP 5, 13 + SWAP 6, 8 + SWAP 7, 9 +%define P2 m13 +%define P1 m8 +%define P0 m9 +%else ; x86-32 +%define P2 [esp+18*16] +%define P1 [esp+19*16] +%define P0 [esp+20*16] + TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \ + [tmpq+stride3q -16], P2, "", a, a + mova P1, m6 + mova P0, m7 +%endif ; x86-32/64 + mova [rsp+ 7*16], m0 + mova [rsp+ 8*16], m1 + mova [rsp+ 9*16], m2 + mova [rsp+10*16], m3 +%define P3 [rsp+6*16] + mova P3, m4 + + mova m0, [dstq+strideq*0] + mova m1, [dstq+strideq*1] + mova m2, [dstq+strideq*2] + mova m3, [dstq+stride3q ] + lea tmpq, [dstq+strideq*4] + mova m4, [tmpq+strideq*0] + mova m5, [tmpq+strideq*1] + mova m6, [tmpq+strideq*2] +%if ARCH_X86_64 + mova m7, [tmpq+stride3q ] + + TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 10 + SWAP 0, 10 + SWAP 1, 11 + SWAP 2, 14 + SWAP 3, 15 +%define Q0 m10 +%define Q1 m11 +%define Q2 m14 +%define Q3 m15 +%else ; x86-32 + TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \ + [tmpq+stride3q ], [rsp+12*16], "", a, a +%define Q0 [esp+21*16] +%define Q1 [esp+22*16] +%define Q2 [esp+23*16] +%define Q3 [esp+24*16] + mova Q0, m0 + mova Q1, m1 + mova Q2, m2 + mova Q3, m3 +%endif ; x86-32/64 + + mova [rsp+11*16], m4 +%if ARCH_X86_64 + mova [rsp+12*16], m5 +%endif + mova [rsp+13*16], m6 + mova [rsp+14*16], m7 +%endif ; %1 == 4/6/8/16 +%endif ; %2 ==/!= v + + ; load L/E/I/H +%if ARCH_X86_32 +%define l_strideq r5 + mov l_strideq, dword lstridem +%ifidn %2, v +%define lq r3 + mov lq, dword lm +%endif +%endif +%ifidn %2, v +%if cpuflag(sse4) + pmovzxbw m1, [lq] + pmovzxbw m0, [lq+l_strideq] + pxor m2, m2 +%else ; ssse3 + movq m1, [lq] + movq m0, [lq+l_strideq] + pxor m2, m2 + REPX {punpcklbw x, m2}, m1, m0 +%endif ; ssse3/sse4 +%else ; %2 != v + movq m0, [lq] ; l0, l1 + movq m1, [lq+l_strideq] ; l2, l3 + punpckldq m0, m1 ; l0, l2, l1, l3 + pxor m2, m2 + punpcklbw m1, m0, m2 ; l0, l2 + punpckhbw m0, m2 ; l1, l3 +%endif ; %2==/!=v +%if ARCH_X86_32 +%ifidn %2, v +%undef lq + mov mstrideq, mstridem +%endif +%endif + pcmpeqw m5, m2, m0 + pand m1, m5 + por m0, m1 ; l[x][] ? 
l[x][] : l[x-stride][] + pshufb m0, [PIC_sym(pb_4x1_4x5_4x9_4x13)] ; l[x][1] + pcmpeqw m5, m2, m0 ; !L + psrlw m5, 1 +%if ARCH_X86_64 + psrlw m2, m0, [lutq+128] + SPLATW m1, [lutq+136] +%else ; x86-32 + mov r5, lutm + psrlw m2, m0, [r5+128] + SPLATW m1, [r5+136] +%endif ; x86-32/64 + pminsw m2, m1 + pmaxsw m2, [PIC_sym(pw_1)] ; I + psrlw m1, m0, 4 ; H + paddw m0, [PIC_sym(pw_2)] + paddw m0, m0 + paddw m0, m2 ; E + REPX {pmullw x, [bdmulq]}, m0, m1, m2 +%if ARCH_X86_32 +%undef l_strideq + lea stride3q, [strideq*3] +%endif + + psubw m3, P1, P0 ; p1-p0 + psubw m4, Q0, Q1 ; q0-q1 + REPX {pabsw x, x}, m3, m4 + pmaxsw m3, m5 + pmaxsw m3, m4 + pcmpgtw m7, m3, m1 ; hev +%if %1 != 4 + psubw m4, P2, P0 ; p2-p0 + pabsw m4, m4 + pmaxsw m4, m3 +%if %1 != 6 + mova m6, P3 ; p3 + psubw m5, m6, P0 ; p3-p0 + pabsw m5, m5 + pmaxsw m4, m5 +%endif ; %1 != 6 + psubw m5, Q0, Q2 ; q0-q2 + pabsw m5, m5 + pmaxsw m4, m5 +%if %1 != 6 + psubw m5, Q0, Q3 ; q0-q3 + pabsw m5, m5 + pmaxsw m4, m5 +%endif ; %1 != 6 + pcmpgtw m4, [bdmulq] ; !flat8in + + psubw m5, P2, P1 ; p2-p1 + pabsw m5, m5 +%if %1 != 6 + psubw m6, P2 ; p3-p2 + pabsw m6, m6 + pmaxsw m5, m6 + psubw m6, Q2, Q3 ; q2-q3 + pabsw m6, m6 + pmaxsw m5, m6 +%endif ; %1 != 6 + psubw m6, Q2, Q1 ; q2-q1 + pabsw m6, m6 + pmaxsw m5, m6 + +%if %1 == 16 + SPLATD m6, [maskq+8] + SPLATD m1, [maskq+4] + por m6, m1 + pand m6, m12 + pcmpeqd m6, m12 + pand m5, m6 +%else ; %1 != 16 + SPLATD m6, [maskq+4] + pand m6, m12 + pcmpeqd m6, m12 + pand m5, m6 ; only apply fm-wide to wd>4 blocks +%endif ; %1==/!=16 + pmaxsw m3, m5 +%endif ; %1 != 4 + pcmpgtw m3, m2 + + psubw m5, P1, Q1 ; p1-q1 + psubw m6, P0, Q0 ; p0-q0 + REPX {pabsw x, x}, m5, m6 + paddw m6, m6 + psrlw m5, 1 + paddw m5, m6 ; abs(p0-q0)*2+(abs(p1-q1)>>1) + pcmpgtw m5, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E + por m3, m5 + +%if %1 == 16 + +%ifidn %2, v + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1] + mova m1, [tmpq+strideq*2] + mova m2, [tmpq+stride3q] +%else ; %2 != v + mova m0, [rsp+ 8*16] + mova m1, [rsp+ 9*16] + mova m2, [rsp+10*16] +%endif ; %2==/!=v + REPX {psubw x, P0}, m0, m1, m2 + REPX {pabsw x, x}, m0, m1, m2 + pmaxsw m1, m0 + pmaxsw m1, m2 +%ifidn %2, v + lea tmpq, [dstq+strideq*4] + mova m0, [tmpq+strideq*0] + mova m2, [tmpq+strideq*1] + mova m5, [tmpq+strideq*2] +%else ; %2 != v + mova m0, [rsp+11*16] + mova m2, [rsp+12*16] + mova m5, [rsp+13*16] +%endif ; %2==/!=v + REPX {psubw x, Q0}, m0, m2, m5 + REPX {pabsw x, x}, m0, m2, m5 + pmaxsw m0, m2 + pmaxsw m1, m5 + pmaxsw m1, m0 + pcmpgtw m1, [bdmulq] ; !flat8out + por m1, m4 ; !flat8in | !flat8out + SPLATD m2, [maskq+8] + pand m5, m2, m12 + pcmpeqd m5, m12 + pandn m1, m5 ; flat16 + pandn m5, m3, m1 ; flat16 & fm + SWAP 1, 5 + + SPLATD m5, [maskq+4] + por m5, m2 + pand m2, m5, m12 + pcmpeqd m2, m12 + pandn m4, m2 ; flat8in + pandn m2, m3, m4 + SWAP 2, 4 + SPLATD m2, [maskq+0] + por m2, m5 + pand m2, m12 + pcmpeqd m2, m12 + pandn m3, m2 + pandn m0, m4, m3 ; fm & !flat8 & !flat16 + SWAP 0, 3 + pandn m0, m1, m4 ; flat8 & !flat16 + SWAP 0, 4 +%elif %1 != 4 + SPLATD m0, [maskq+4] + pand m2, m0, m12 + pcmpeqd m2, m12 + pandn m4, m2 + pandn m2, m3, m4 ; flat8 & fm + SWAP 2, 4 + SPLATD m2, [maskq+0] + por m0, m2 + pand m0, m12 + pcmpeqd m0, m12 + pandn m3, m0 + pandn m0, m4, m3 ; fm & !flat8 + SWAP 0, 3 +%else ; %1 == 4 + SPLATD m0, [maskq+0] + pand m0, m12 + pcmpeqd m0, m12 + pandn m3, m0 ; fm +%endif ; %1==/!=4 + + ; short filter +%if ARCH_X86_64 + SPLATW m0, r7m +%else + SPLATW m0, bdmulm +%endif + pcmpeqw m2, m2 + psrlw m0, 1 ; 511 or 2047 + pxor m2, m0 ; 
-512 or -2048 + + psubw m5, Q0, P0 ; q0-p0 + paddw m6, m5, m5 + paddw m6, m5 ; 3*(q0-p0) + psubw m5, P1, Q1 ; iclip_diff(p1-q1) + pminsw m5, m0 + pmaxsw m5, m2 + pand m5, m7 ; f=iclip_diff(p1-q1)&hev + paddw m5, m6 ; f=iclip_diff(3*(q0-p0)+f) + pminsw m5, m0 + pmaxsw m5, m2 + pand m3, m5 ; f&=fm + paddw m5, m3, [PIC_sym(pw_3)] + paddw m3, [PIC_sym(pw_4)] + REPX {pminsw x, m0}, m5, m3 + psraw m5, 3 ; f2 + psraw m3, 3 ; f1 + psubw m0, m2 ; 1023 or 4095 + pxor m2, m2 +%if ARCH_X86_64 + paddw P0, m5 + psubw Q0, m3 +%else + paddw m5, P0 + psubw m6, Q0, m3 + REPX {pminsw x, m0}, m5, m6 + REPX {pmaxsw x, m2}, m5, m6 +%endif + + paddw m3, [PIC_sym(pw_1)] + psraw m3, 1 ; f=(f1+1)>>1 + pandn m7, m3 ; f&=!hev + SWAP 7, 3 +%if ARCH_X86_64 + paddw P1, m3 + psubw Q1, m3 + REPX {pminsw x, m0}, P1, P0, Q0, Q1 + REPX {pmaxsw x, m2}, P1, P0, Q0, Q1 +%else + psubw m7, Q1, m3 + paddw m3, P1 + REPX {pminsw x, m0}, m7, m3 + REPX {pmaxsw x, m2}, m7, m3 +%if %1 > 4 + mova P1, m3 + mova P0, m5 + mova Q0, m6 + mova Q1, m7 +%endif +%endif + +%if %1 == 16 + +; m8-11 = p1/p0/q0/q1, m4=flat8, m1=flat16 +; m12=filter bits mask +; m13-15=p2/q2/q3 +; m0,2-3,5-7 = free + + ; flat16 filter +%ifidn %2, v + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1] ; p6 + mova m2, [tmpq+strideq*2] ; p5 + mova m7, [tmpq+stride3q] ; p4 + mova m6, [tmpq+strideq*4] ; p3 + lea tmpq, [dstq+mstrideq*4] +%else ; %2 != v + mova m0, [rsp+ 8*16] + mova m2, [rsp+ 9*16] + mova m7, [rsp+10*16] + mova m6, [rsp+ 6*16] +%endif ; %2==/!=v + + mova [rsp+ 0*16], m4 + + ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 + psllw m3, m0, 3 ; p6*8 + paddw m3, [PIC_sym(pw_8)] + paddw m5, m2, m7 ; p5+p4 + psubw m3, m0 + paddw m5, m5 ; (p5+p4)*2 + paddw m3, m6 ; p6*7+p3 + paddw m5, P2 ; (p5+p4)*2+p2 + paddw m3, P1 ; p6*7+p3+p1 + paddw m5, P0 ; (p5+p4)*2+p2+p0 + paddw m3, Q0 ; p6*7+p3+p1+q0 + paddw m3, m5 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, m2 + por m5, m4 +%ifidn %2, v + mova [tmpq+mstrideq*2], m5 ; p5 +%else ; %2 != v + mova [rsp+9*16], m5 +%endif ; %2==/!=v + + ; sub p6*2, add p3/q1 + paddw m3, m6 + paddw m5, m0, m0 + paddw m3, Q1 + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, m7 + por m5, m4 +%ifidn %2, v + mova [tmpq+mstrideq*1], m5 ; p4 +%else ; %2 != v + mova [rsp+10*16], m5 +%endif ; %2==/!=v + + ; sub p6/p5, add p2/q2 + psubw m3, m0 + paddw m5, P2, Q2 + psubw m3, m2 + paddw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, m6 + por m5, m4 +%ifidn %2, v + mova [tmpq+strideq*0], m5 ; p3 +%else ; %2 != v + mova [rsp+6*16], m5 +%endif ; %2==/!=v + +%define WRITE_IN_PLACE 0 +%ifidn %2, v +%if ARCH_X86_64 +%define WRITE_IN_PLACE 1 +%endif +%endif + + ; sub p6/p4, add p1/q3 + paddw m3, P1 + paddw m5, m0, m7 + paddw m3, Q3 + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, P2 + por m5, m4 +%if WRITE_IN_PLACE + mova [tmpq+strideq*1], m5 +%else + mova [rsp+1*16], m5 ; don't clobber p2/m13 +%endif + + ; sub p6/p3, add p0/q4 + paddw m3, P0 + paddw m5, m0, m6 +%ifidn %2, v + paddw m3, [dstq+strideq*4] +%else ; %2 != v + paddw m3, [rsp+11*16] +%endif ; %2==/!=v + psubw m3, m5 + psrlw m5, m3, 4 + pand m5, m1 + pandn m4, m1, P1 + por m5, m4 +%if WRITE_IN_PLACE + mova [dstq+mstrideq*2], m5 +%else + mova [rsp+2*16], m5 ; don't clobber p1/m3 +%endif + + ; sub p6/p2, add q0/q5 + paddw m3, Q0 + paddw m5, m0, P2 +%ifidn %2, v +%if ARCH_X86_32 + lea r4, P2 +%endif + lea tmpq, [dstq+strideq*4] + paddw m3, [tmpq+strideq*1] +%else ; %2 != v + paddw m3, [rsp+12*16] +%endif ; %2==/!=v + psubw m3, m5 + psrlw m5, m3, 4 
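(The flat16 section above and below applies the same running-sum idea at width 16: comments like "p6*7+p5*2+p4*2+p3+p2+p1+p0+q0" track a single 16-weight sum that is updated with one add/subtract pair per output and shifted right by 4, the +8 rounding term having been folded in via "p6*8" minus p6. A scalar sketch with invented names, not dav1d source:)

    /* p[] = p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6; o[] = p5'..q5' */
    static void flat16(const uint16_t *p, uint16_t *o)
    {
        const int p6 = p[0], p5 = p[1], p4 = p[2], p3 = p[3], p2 = p[4],
                  p1 = p[5], p0 = p[6], q0 = p[7], q1 = p[8], q2 = p[9],
                  q3 = p[10], q4 = p[11], q5 = p[12], q6 = p[13];
        int s = 7 * p6 + 2 * p5 + 2 * p4 + p3 + p2 + p1 + p0 + q0;
        o[ 0] = (s + 8) >> 4;                          /* p5' */
        s += p3 + q1 - 2 * p6;  o[ 1] = (s + 8) >> 4;  /* p4' */
        s += p2 + q2 - p6 - p5; o[ 2] = (s + 8) >> 4;  /* p3' */
        s += p1 + q3 - p6 - p4; o[ 3] = (s + 8) >> 4;  /* p2' */
        s += p0 + q4 - p6 - p3; o[ 4] = (s + 8) >> 4;  /* p1' */
        s += q0 + q5 - p6 - p2; o[ 5] = (s + 8) >> 4;  /* p0' */
        s += q1 + q6 - p6 - p1; o[ 6] = (s + 8) >> 4;  /* q0' */
        s += q2 + q6 - p5 - p0; o[ 7] = (s + 8) >> 4;  /* q1' */
        s += q3 + q6 - p4 - q0; o[ 8] = (s + 8) >> 4;  /* q2' */
        s += q4 + q6 - p3 - q1; o[ 9] = (s + 8) >> 4;  /* q3' */
        s += q5 + q6 - p2 - q2; o[10] = (s + 8) >> 4;  /* q4' */
        s += 2 * q6 - p1 - q3;  o[11] = (s + 8) >> 4;  /* q5' */
    }

(Each output is then blended with the unfiltered pixel through the pand/pandn/por triples, so only lanes where the flat16 condition holds are replaced.)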
+ pand m5, m1
+ pandn m4, m1, P0
+ por m5, m4
+%if WRITE_IN_PLACE
+ mova [dstq+mstrideq*1], m5
+%else
+ mova [rsp+3*16], m5 ; don't clobber p0/m4
+%endif
+
+ ; sub p6/p1, add q1/q6
+ paddw m3, Q1
+ paddw m5, m0, P1
+%ifidn %2, v
+ mova m0, [tmpq+strideq*2] ; q6
+%else ; %2 != v
+ mova m0, [rsp+13*16] ; q6
+%endif ; %2==/!=v
+ paddw m3, m0
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, Q0
+ por m5, m4
+%if WRITE_IN_PLACE
+ mova [dstq], m5
+%else
+ mova [rsp+4*16], m5 ; don't clobber q0/m5
+%endif
+
+ ; sub p5/p0, add q2/q6
+ paddw m3, Q2
+ paddw m5, m2, P0
+ paddw m3, m0
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+ pandn m4, m1, Q1
+ por m2, m5, m4 ; don't clobber q1/m6
+
+ ; sub p4/q0, add q3/q6
+ paddw m3, Q3
+ paddw m7, Q0
+ paddw m3, m0
+ psubw m3, m7
+ psrlw m7, m3, 4
+ pand m7, m1
+ pandn m4, m1, Q2
+ por m7, m4 ; don't clobber q2/m14
+
+ ; sub p3/q1, add q4/q6
+%ifidn %2, v
+ paddw m3, [tmpq+strideq*0]
+%else ; %2 != v
+ paddw m3, [rsp+11*16]
+%endif ; %2==/!=v
+ paddw m6, Q1
+ paddw m3, m0
+ psubw m3, m6
+ psrlw m6, m3, 4
+ pand m6, m1
+ pandn m4, m1, Q3
+ por m6, m4
+%if WRITE_IN_PLACE
+ mova [tmpq+mstrideq], m6 ; q3
+%else ; %2 != v
+ mova [rsp+5*16], m6
+%endif ; %2==/!=v
+
+ ; sub p2/q2, add q5/q6
+%ifidn %2, v
+ paddw m3, [tmpq+strideq*1]
+%if ARCH_X86_64
+ paddw m5, P2, Q2
+%else
+ ; tmpq is clobbered here, so use the backup pointer to P2 instead
+ paddw m5, [r4], Q2
+ mov pic_regq, pic_regm
+%endif
+%else ; %2 != v
+ paddw m3, [rsp+12*16]
+ paddw m5, P2, Q2
+%endif ; %2==/!=v
+ paddw m3, m0
+ psubw m3, m5
+ psrlw m5, m3, 4
+ pand m5, m1
+%ifidn %2, v
+ pandn m4, m1, [tmpq+strideq*0]
+%else ; %2 != v
+ pandn m4, m1, [rsp+11*16]
+%endif ; %2==/!=v
+ por m5, m4
+%ifidn %2, v
+ mova [tmpq+strideq*0], m5 ; q4
+%else ; %2 != v
+ mova [rsp+11*16], m5
+%endif ; %2==/!=v
+
+ ; sub p1/q3, add q6*2
+ psubw m3, P1
+ paddw m0, m0
+ psubw m3, Q3
+ paddw m3, m0
+ psrlw m5, m3, 4
+ pand m5, m1
+%ifidn %2, v
+ pandn m4, m1, [tmpq+strideq*1]
+%else ; %2 != v
+ pandn m4, m1, [rsp+12*16]
+%endif ; %2==/!=v
+ por m5, m4
+%ifidn %2, v
+ mova [tmpq+strideq*1], m5 ; q5
+%else ; %2 != v
+ mova [rsp+12*16], m5
+%endif ; %2==/!=v
+
+ mova m4, [rsp+0*16]
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*4]
+%endif
+%if ARCH_X86_64
+ SWAP 2, 11
+ SWAP 7, 14
+ SWAP 6, 15
+%else ; x86-32
+ mova Q1, m2
+ mova Q2, m7
+%endif ; x86-32/64
+%if WRITE_IN_PLACE
+ mova P2, [tmpq+strideq*1]
+ mova P1, [tmpq+strideq*2]
+ mova P0, [tmpq+stride3q]
+ mova Q0, [dstq]
+%elif ARCH_X86_64
+ mova P2, [rsp+1*16]
+ mova P1, [rsp+2*16]
+ mova P0, [rsp+3*16]
+ mova Q0, [rsp+4*16]
+%else ; !WRITE_IN_PLACE & x86-32
+ mova m0, [rsp+1*16]
+ mova m1, [rsp+2*16]
+ mova m2, [rsp+3*16]
+ mova m3, [rsp+4*16]
+ mova m7, [rsp+5*16]
+ mova P2, m0
+ mova P1, m1
+ mova P0, m2
+ mova Q0, m3
+ mova Q3, m7
+%endif ; WRITE_IN_PLACE / x86-32/64
+%undef WRITE_IN_PLACE
+%endif ; %1 == 16
+
+%if %1 >= 8
+
+ ; flat8 filter
+ mova m0, P3 ; p3
+ paddw m1, m0, P2 ; p3+p2
+ paddw m2, P1, P0 ; p1+p0
+ paddw m3, m1, m1 ; 2*(p3+p2)
+ paddw m2, m0 ; p1+p0+p3
+ paddw m3, Q0 ; 2*(p3+p2)+q0
+ paddw m2, m3 ; 3*p3+2*p2+p1+p0+q0
+ pmulhrsw m7, m2, [PIC_sym(pw_4096)]
+ psubw m7, P2
+ pand m7, m4
+
+ paddw m3, P1, Q1 ; p1+q1
+ psubw m2, m1 ; 2*p3+p2+p1+p0+q0
+ paddw m2, m3 ; 2*p3+p2+2*p1+p0+q0+q1
+ pmulhrsw m3, m2, [PIC_sym(pw_4096)]
+ psubw m3, P1
+ pand m3, m4
+
+ paddw m5, m0, P1 ; p3+p1
+ paddw m6, P0, Q2 ; p0+q2
+ psubw m2, m5 ; p3+p2+p1+p0+q0+q1
+ paddw m2, m6 ; p3+p2+p1+2*p0+q0+q1+q2
+ pmulhrsw m5, m2, [PIC_sym(pw_4096)]
+ psubw m5, P0
+ pand
m5, m4 + + paddw m6, m0, P0 ; p3+p0 + paddw m1, Q0, Q3 ; q0+q3 + psubw m2, m6 ; p2+p1+p0+q0+q1+q2 + paddw m2, m1 ; p2+p1+p0+2*q0+q1+q2+q3 + pmulhrsw m6, m2, [PIC_sym(pw_4096)] + psubw m6, Q0 + pand m6, m4 + + paddw m2, Q1 ; p2+p1+p0+2*q0+2*q1+q2+q3 + paddw m2, Q3 ; p2+p1+p0+2*q0+2*q1+q2+2*q3 + paddw m1, P2, Q0 ; p2+q0 + psubw m2, m1 ; p1+p0+q0+2*q1+q2+2*q3 + pmulhrsw m1, m2, [PIC_sym(pw_4096)] + psubw m1, Q1 + pand m1, m4 + + psubw m2, P1 ; p0+q0+2*q1+q2+2*q3 + psubw m2, Q1 ; p0+q0+q1+q2+2*q3 + paddw m0, Q3, Q2 ; q3+q2 + paddw m2, m0 ; p0+q0+q1+2*q2+3*q3 + pmulhrsw m2, [PIC_sym(pw_4096)] + psubw m2, Q2 + pand m2, m4 + + paddw m7, P2 + paddw m3, P1 + paddw m5, P0 + paddw m6, Q0 + paddw m1, Q1 + paddw m2, Q2 + +%ifidn %2, v + mova [tmpq+strideq*1], m7 ; p2 + mova [tmpq+strideq*2], m3 ; p1 + mova [tmpq+stride3q ], m5 ; p0 + mova [dstq+strideq*0], m6 ; q0 + mova [dstq+strideq*1], m1 ; q1 + mova [dstq+strideq*2], m2 ; q2 +%else ; %2 != v + mova m0, P3 + +%if %1 == 8 + lea tmpq, [dstq+strideq*4] +%if ARCH_X86_64 + SWAP 4, 15 + TRANSPOSE8X8W 0, 7, 3, 5, 6, 1, 2, 4, 8 +%else + TRANSPOSE8X8W 0, 7, 3, 5, 6, 1, 2, 4, "", \ + Q3, [tmpq+strideq*1-8], a, u +%endif + + ; write 8x8 + movu [dstq+strideq*0-8], m0 + movu [dstq+strideq*1-8], m7 + movu [dstq+strideq*2-8], m3 + movu [dstq+stride3q -8], m5 + movu [tmpq+strideq*0-8], m6 +%if ARCH_X86_64 + movu [tmpq+strideq*1-8], m1 +%endif + movu [tmpq+strideq*2-8], m2 + movu [tmpq+stride3q -8], m4 + lea dstq, [dstq+strideq*8] +%else ; %1 != 8 +%if ARCH_X86_64 + SWAP 6, 8 + SWAP 1, 9 + SWAP 2, 10 +%else + mova [rsp+1*16], m6 + mova [rsp+2*16], m1 + mova [rsp+3*16], m2 +%endif + + mova m1, [rsp+ 7*16] + mova m2, [rsp+ 8*16] + mova m4, [rsp+ 9*16] + mova m6, [rsp+10*16] + lea tmpq, [dstq+strideq*4] +%if ARCH_X86_64 + TRANSPOSE8X8W 1, 2, 4, 6, 0, 7, 3, 5, 11 +%else + mova [rsp+7*16], m5 + TRANSPOSE8X8W 1, 2, 4, 6, 0, 7, 3, 5, "", \ + [rsp+7*16], [tmpq+strideq*1-16], a, a +%endif + + mova [dstq+strideq*0-16], m1 + mova [dstq+strideq*1-16], m2 + mova [dstq+strideq*2-16], m4 + mova [dstq+stride3q -16], m6 + mova [tmpq+strideq*0-16], m0 +%if ARCH_X86_64 + mova [tmpq+strideq*1-16], m7 +%endif + mova [tmpq+strideq*2-16], m3 + mova [tmpq+stride3q -16], m5 + +%if ARCH_X86_64 + SWAP 6, 8 + SWAP 1, 9 + SWAP 2, 10 + SWAP 4, 15 +%else + mova m6, [rsp+1*16] + mova m1, [rsp+2*16] + mova m2, [rsp+3*16] + mova m4, Q3 +%endif + mova m0, [rsp+11*16] + mova m3, [rsp+12*16] + mova m5, [rsp+13*16] +%if ARCH_X86_64 + mova m7, [rsp+14*16] + TRANSPOSE8X8W 6, 1, 2, 4, 0, 3, 5, 7, 8 +%else + TRANSPOSE8X8W 6, 1, 2, 4, 0, 3, 5, 7, "", \ + [rsp+14*16], [tmpq+strideq*1], a, a +%endif + mova [dstq+strideq*0], m6 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m4 + mova [tmpq+strideq*0], m0 +%if ARCH_X86_64 + mova [tmpq+strideq*1], m3 +%endif + mova [tmpq+strideq*2], m5 + mova [tmpq+stride3q ], m7 + lea dstq, [dstq+strideq*8] +%endif ; %1==/!=8 +%endif ; %2==/!=v +%elif %1 == 6 + ; flat6 filter + paddw m3, P1, P0 ; p1+p0 + paddw m3, P2 ; p2+p1+p0 + paddw m6, P2, Q0 ; p2+q0 + paddw m3, m3 ; 2*(p2+p1+p0) + paddw m3, m6 ; p2+2*(p2+p1+p0)+q0 + pmulhrsw m2, m3, [PIC_sym(pw_4096)] + psubw m2, P1 + pand m2, m4 + + paddw m3, Q0 ; p2+2*(p2+p1+p0+q0) + paddw m6, P2, P2 ; 2*p2 + paddw m3, Q1 ; p2+2*(p2+p1+p0+q0)+q1 + psubw m3, m6 ; p2+2*(p1+p0+q0)+q1 + pmulhrsw m5, m3, [PIC_sym(pw_4096)] + psubw m5, P0 + pand m5, m4 + + paddw m3, Q1 ; p2+2*(p1+p0+q0+q1) + paddw m6, P2, P1 ; p2+p1 + paddw m3, Q2 ; p2+2*(p1+p0+q0+q1)+q2 + psubw m3, m6 ; p1+2*(p0+q0+q1)+q2 + pmulhrsw m6, m3, 
[PIC_sym(pw_4096)]
+ psubw m6, Q0
+ pand m6, m4
+
+ psubw m3, P1 ; 2*(p0+q0+q1)+q2
+%if ARCH_X86_64
+ paddw Q2, Q2 ; q2*2
+%else
+ mova m0, Q2
+ paddw m0, m0
+%endif
+ psubw m3, P0 ; p0+2*(q0+q1)+q2
+%if ARCH_X86_64
+ paddw m3, Q2 ; p0+2*(q0+q1+q2)+q2
+%else
+ paddw m3, m0
+%endif
+ pmulhrsw m3, [PIC_sym(pw_4096)]
+ psubw m3, Q1
+ pand m3, m4
+
+ paddw m2, P1
+ paddw m5, P0
+ paddw m6, Q0
+ paddw m3, Q1
+
+%ifidn %2, v
+ mova [dstq+mstrideq*2], m2 ; p1
+ mova [dstq+mstrideq*1], m5 ; p0
+ mova [dstq+strideq*0], m6 ; q0
+ mova [dstq+strideq*1], m3 ; q1
+%else ; %2 != v
+ TRANSPOSE_8x4_AND_WRITE_4x8 m2, m5, m6, m3, m0
+%endif ; %2==/!=v
+%else ; %1 == 4
+%if ARCH_X86_64
+%ifidn %2, v
+ mova [dstq+mstrideq*2], P1 ; p1
+ mova [dstq+mstrideq*1], P0 ; p0
+ mova [dstq+strideq*0], Q0 ; q0
+ mova [dstq+strideq*1], Q1 ; q1
+%else ; %2 != v
+ TRANSPOSE_8x4_AND_WRITE_4x8 P1, P0, Q0, Q1, m0
+%endif ; %2==/!=v
+%else ; x86-32
+%ifidn %2, v
+ mova [dstq+mstrideq*2], m3
+ mova [dstq+mstrideq*1], m5
+ mova [dstq+strideq*0], m6
+ mova [dstq+strideq*1], m7
+%else ; %2 != v
+ TRANSPOSE_8x4_AND_WRITE_4x8 m3, m5, m6, m7, m0
+%endif ; %2==/!=v
+%endif ; x86-32/64
+%endif ; %1
+%undef P3
+%undef P2
+%undef P1
+%undef P0
+%undef Q0
+%undef Q1
+%undef Q2
+%undef Q3
+%endmacro
+
+INIT_XMM ssse3
+; stack layout:
+; r0 - flat8 backup inside flat16 code
+%if ARCH_X86_64
+cglobal lpf_v_sb_y_16bpc, 6, 12, 16, -16 * 1, \
+ dst, stride, mask, l, l_stride, lut, \
+ w, stride3, mstride, tmp, mask_bits, bdmul
+ mov r6d, r7m
+ sar r6d, 7
+ and r6d, 16 ; 0 for 10bpc, 16 for 12bpc
+ lea bdmulq, [pw_4]
+ add bdmulq, r6
+ mov wd, wm
+ shl l_strideq, 2
+ sub lq, l_strideq
+%else
+; stack layout [32bit only]:
+; r1-4 - p2-q0 post-filter16
+; r5 - p3
+; r6 - q3 post-filter16
+; r7 - GPRs [mask_bitsm, mstridem]
+; r8 - m12/pb_mask
+; r9 - bdmulq
+cglobal lpf_v_sb_y_16bpc, 4, 7, 8, -16 * (10 + extra_stack), \
+ dst, stride, mask, mstride, pic_reg, stride3, tmp
+ RELOC_ARGS v, 10*16
+%if STACK_ALIGNMENT >= 16
+ mov r5d, r7m
+%endif
+ sar r5d, 7
+ and r5d, 16 ; 0 for 10bpc, 16 for 12bpc
+ LEA pic_regq, PIC_base
+%define pic_regm dword [esp+7*16+2*gprsize]
+ mov pic_regm, pic_regq
+ mova m0, [PIC_sym(pw_4)+r5]
+%define bdmulq esp+9*16
+ mova [bdmulq], m0
+ shl dword lstridem, 2
+ sub r3, dword lstridem
+ mov dword lm, r3
+%endif
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+%if ARCH_X86_64
+ mov mask_bitsd, 0x3
+ mova m12, [pb_mask]
+%else
+%define mstridem dword [esp+7*16+1*gprsize]
+ mov mstridem, mstrideq
+%define mask_bitsm dword [esp+7*16+0*gprsize]
+ mov mask_bitsm, 0x3
+ mova m0, [PIC_sym(pb_mask)]
+%define m12 [esp+8*16]
+ mova m12, m0
+%endif
+
+.loop:
+%if ARCH_X86_64
+ test [maskq+8], mask_bitsd ; vmask[2]
+%else
+ mov r6d, mask_bitsm
+ test [maskq+8], r6d
+%endif
+ jz .no_flat16
+
+ FILTER 16, v
+ jmp .end
+
+.no_flat16:
+%if ARCH_X86_64
+ test [maskq+4], mask_bitsd ; vmask[1]
+%else
+ test [maskq+4], r6d
+%endif
+ jz .no_flat
+
+ FILTER 8, v
+ jmp .end
+
+.no_flat:
+%if ARCH_X86_64
+ test [maskq+0], mask_bitsd ; vmask[0]
+%else
+ test [maskq+0], r6d
+%endif
+ jz .end
+
+ FILTER 4, v
+
+.end:
+%if ARCH_X86_64
+ pslld m12, 2
+ add lq, 8
+%else
+ mova m0, m12
+ pslld m0, 2
+ mova m12, m0
+ add dword lm, 8
+%endif
+ add dstq, 16
+%if ARCH_X86_64
+ shl mask_bitsd, 2
+ sub wd, 2
+%else
+ shl mask_bitsm, 2
+ sub dword wm, 2
+%endif
+ jg .loop
+%undef mask_bitsm
+%undef bdmulq
+ UNRELOC_ARGS
+ RET
+
+INIT_XMM ssse3
+; stack layout:
+; r0 - flat8 backup inside flat16
+; r1-4 - p2-q0 post-filter16 backup
+; r5 - q3
post-filter16 backup +; r6 - p3 +; r7-10 - p7-4 +; r11-14 - q4-7 +%if ARCH_X86_64 +cglobal lpf_h_sb_y_16bpc, 6, 11, 16, -16 * 15, \ + dst, stride, mask, l, l_stride, lut, \ + h, stride3, tmp, mask_bits, bdmul + mov r6d, r7m + sar r6d, 7 + and r6d, 16 ; 0 for 10bpc, 16 for 12bpc + lea bdmulq, [pw_4] + add bdmulq, r6 + mov hd, hm + shl l_strideq, 2 +%else +; stack layout [32bit only]: +; r15 - GPRs [mask_bitsm] +; r16 - m12/pb_mask +; r17 - bdmulq +; r18-24 - p2-q3 +cglobal lpf_h_sb_y_16bpc, 4, 7, 8, -16 * (25 + extra_stack), \ + dst, stride, mask, l, pic_reg, stride3, tmp + RELOC_ARGS h, 25*16 +%if STACK_ALIGNMENT >= 16 + mov r5d, r7m +%endif + sar r5d, 7 + and r5d, 16 ; 0 for 10bpc, 16 for 12bpc + LEA pic_regq, PIC_base + mova m0, [PIC_sym(pw_4)+r5] +%define bdmulq esp+17*16 + mova [bdmulq], m0 + shl dword lstridem, 2 +%endif + sub lq, 4 + lea stride3q, [strideq*3] +%if ARCH_X86_64 + mov mask_bitsd, 0x3 + mova m12, [pb_mask] +%else +%define mask_bitsm dword [esp+15*16+0*gprsize] + mov mask_bitsm, 0x3 + mova m0, [PIC_sym(pb_mask)] +%define m12 [esp+16*16] + mova m12, m0 +%endif + +.loop: +%if ARCH_X86_64 + test [maskq+8], mask_bitsd ; vmask[2] +%else + mov r6d, mask_bitsm + test [maskq+8], r6d +%endif + jz .no_flat16 + + FILTER 16, h + jmp .end + +.no_flat16: +%if ARCH_X86_64 + test [maskq+4], mask_bitsd ; vmask[1] +%else + test [maskq+4], r6d +%endif + jz .no_flat + + FILTER 8, h + jmp .end + +.no_flat: +%if ARCH_X86_64 + test [maskq+0], mask_bitsd ; vmask[0] +%else + test [maskq+0], r6d +%endif + jz .no_filter + + FILTER 4, h + jmp .end + +.no_filter: + lea dstq, [dstq+strideq*8] +.end: +%if ARCH_X86_64 + pslld m12, 2 + lea lq, [lq+l_strideq*2] + shl mask_bitsd, 2 + sub hd, 2 +%else + mova m0, m12 + pslld m0, 2 + mova m12, m0 + add lq, dword lstridem + add lq, dword lstridem + shl mask_bitsm, 2 + sub dword hm, 2 +%endif + jg .loop +%undef mask_bitsm +%undef bdmulq + UNRELOC_ARGS + RET + +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \ + dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp, mask_bits, bdmul + mov r6d, r7m + sar r6d, 7 + and r6d, 16 ; 0 for 10bpc, 16 for 12bpc + lea bdmulq, [pw_4] + add bdmulq, r6 + mov wd, wm + shl l_strideq, 2 + sub lq, l_strideq +%else +; stack layout [32bit only]: +; r0 - GPRs [mask_bitsm, mstridem] +; r1 - m12/pb_mask +; r2 - bdmulq +cglobal lpf_v_sb_uv_16bpc, 4, 7, 8, -16 * (3 + extra_stack), \ + dst, stride, mask, mstride, pic_reg, stride3, tmp + RELOC_ARGS v, 3*16 +%if STACK_ALIGNMENT >= 16 + mov r5d, r7m +%endif + sar r5d, 7 + and r5d, 16 ; 0 for 10bpc, 16 for 12bpc + LEA pic_regq, PIC_base + mova m0, [PIC_sym(pw_4)+r5] +%define bdmulq esp+2*16 + mova [bdmulq], m0 + shl dword lstridem, 2 + sub r3, dword lstridem + mov dword lm, r3 +%endif + mov mstrideq, strideq + neg mstrideq + lea stride3q, [strideq*3] +%if ARCH_X86_64 + mov mask_bitsd, 0x3 + mova m12, [pb_mask] +%else +%define mask_bitsm dword [esp+0*gprsize] +%define mstridem dword [esp+1*gprsize] + mov mask_bitsm, 0x3 + mov mstridem, mstrideq + mova m0, [PIC_sym(pb_mask)] +%define m12 [esp+1*16] + mova m12, m0 +%endif + +.loop: +%if ARCH_X86_64 + test [maskq+4], mask_bitsd ; vmask[1] +%else + mov r6d, mask_bitsm + test [maskq+4], r6d +%endif + jz .no_flat + + FILTER 6, v + jmp .end + +.no_flat: +%if ARCH_X86_64 + test [maskq+0], mask_bitsd ; vmask[0] +%else + test [maskq+0], r6d +%endif + jz .end + + FILTER 4, v + +.end: +%if ARCH_X86_64 + pslld m12, 2 + add lq, 8 +%else + mova m0, m12 + pslld m0, 2 + mova m12, m0 + add dword lm, 8 +%endif + add dstq, 16 
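(All of the .loop bodies in this file share the dispatch pattern above: two 4-pixel units are handled per iteration, mask_bits starts at 0x3 and is shifted left by 2 each pass, and the widest filter whose vmask bit is set wins. A hypothetical scalar driver, for illustration only:)

    #include <stdint.h>

    /* vmask[2]/[1]/[0] carry one bit per 4-pixel unit for the wide/medium/
     * narrow filters (wd=16/8/4 for luma; chroma uses only vmask[1]/[0]
     * for wd=6/4). w is the edge length in 4-pixel units. */
    static void lpf_sb_dispatch(const uint32_t vmask[3], int w, int luma)
    {
        uint32_t bits = 0x3;                 /* two units per SSE iteration */
        for (int i = 0; i < w; i += 2, bits <<= 2) {
            if (luma && (vmask[2] & bits)) { /* FILTER 16 */ }
            else if (vmask[1] & bits)      { /* FILTER 8 (luma) / 6 (chroma) */ }
            else if (vmask[0] & bits)      { /* FILTER 4 */ }
            /* else: neither unit is filtered; only advance the pointers */
        }
    }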
+%if ARCH_X86_64 + shl mask_bitsd, 2 + sub wd, 2 +%else + shl mask_bitsm, 2 + sub dword wm, 2 +%endif + jg .loop +%undef mask_bitsm +%undef bdmulq + UNRELOC_ARGS + RET + +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal lpf_h_sb_uv_16bpc, 6, 11, 16, \ + dst, stride, mask, l, l_stride, lut, \ + h, stride3, tmp, mask_bits, bdmul + mov r6d, r7m + sar r6d, 7 + and r6d, 16 ; 0 for 10bpc, 16 for 12bpc + lea bdmulq, [pw_4] + add bdmulq, r6 + mov hd, hm + shl l_strideq, 2 +%else +; stack layout [32bit only]: +; r0 - GPRs [mask_bitsm] +; r1 - m12/pb_mask +; r2 - bdmulq +; r3-8 - p2-q2 +cglobal lpf_h_sb_uv_16bpc, 4, 7, 8, -16 * (9 + extra_stack), \ + dst, stride, mask, l, pic_reg, stride3, tmp + RELOC_ARGS h, 9*16 +%if STACK_ALIGNMENT >= 16 + mov r5d, r7m +%endif + sar r5d, 7 + and r5d, 16 ; 0 for 10bpc, 16 for 12bpc + LEA pic_regq, PIC_base + mova m0, [PIC_sym(pw_4)+r5] +%define bdmulq esp+2*16 + mova [bdmulq], m0 + shl dword lstridem, 2 +%endif + sub lq, 4 + lea stride3q, [strideq*3] +%if ARCH_X86_64 + mov mask_bitsd, 0x3 + mova m12, [pb_mask] +%else +%define mask_bitsm dword [esp+0*gprsize] + mov mask_bitsm, 0x3 + mova m0, [PIC_sym(pb_mask)] +%define m12 [esp+1*16] + mova m12, m0 +%endif + +.loop: +%if ARCH_X86_64 + test [maskq+4], mask_bitsd ; vmask[1] +%else + mov r6d, mask_bitsm + test [maskq+4], r6d +%endif + jz .no_flat + + FILTER 6, h + jmp .end + +.no_flat: +%if ARCH_X86_64 + test [maskq+0], mask_bitsd ; vmask[0] +%else + test [maskq+0], r6d +%endif + jz .no_filter + + FILTER 4, h + jmp .end + +.no_filter: + lea dstq, [dstq+strideq*8] +.end: +%if ARCH_X86_64 + pslld m12, 2 + lea lq, [lq+l_strideq*2] + shl mask_bitsd, 2 + sub hd, 2 +%else + mova m0, m12 + pslld m0, 2 + mova m12, m0 + add lq, dword lstridem + add lq, dword lstridem + shl mask_bitsm, 2 + sub dword hm, 2 +%endif + jg .loop +%undef mask_bitsm +%undef bdmulq + UNRELOC_ARGS + RET diff --git a/third_party/dav1d/src/x86/loopfilter_avx2.asm b/third_party/dav1d/src/x86/loopfilter_avx2.asm new file mode 100644 index 0000000000..84696c758a --- /dev/null +++ b/third_party/dav1d/src/x86/loopfilter_avx2.asm @@ -0,0 +1,1569 @@ +; Copyright © 2018-2021, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
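(Before the 8bpc AVX2 file begins: every lpf_{v,h}_sb_{y,uv} entry point in these files implements the same dav1d loop-filter DSP hook. Reconstructed from the argument registers used above, the C-side prototype is roughly the following; parameter names are approximate, and the 8bpc variants omit the trailing bitdepth_max:)

    /* sketch of the entry-point signature, 16bpc SSSE3 variant */
    void dav1d_lpf_v_sb_y_16bpc_ssse3(uint16_t *dst, ptrdiff_t stride,
                                      const uint32_t *mask,  /* vmask[0..2] */
                                      const uint8_t (*l)[4], ptrdiff_t l_stride,
                                      const Av1FilterLUT *lut,
                                      int w, int bitdepth_max);

(lut packs the e[64] and i[64] threshold tables followed by the sharpness pair, which is why the filter code reads the sharpness values at [lutq+128] and [lutq+136].)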
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 + +pb_4x1_4x5_4x9_4x13: times 2 db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 +pb_7_1: times 16 db 7, 1 +pb_3_1: times 16 db 3, 1 +pb_2_1: times 16 db 2, 1 +pb_m1_0: times 16 db -1, 0 +pb_m1_1: times 16 db -1, 1 +pb_m1_2: times 16 db -1, 2 +pb_1: times 32 db 1 +pb_2: times 32 db 2 +pb_3: times 32 db 3 +pb_4: times 32 db 4 +pb_16: times 32 db 16 +pb_63: times 32 db 63 +pb_64: times 32 db 64 +pb_128: times 32 db 0x80 +pb_129: times 32 db 0x81 +pb_240: times 32 db 0xf0 +pb_248: times 32 db 0xf8 +pb_254: times 32 db 0xfe + +pw_2048: times 16 dw 2048 +pw_4096: times 16 dw 4096 + +pb_mask: dd 1, 2, 4, 8, 16, 32, 64, 128 + +SECTION .text + +%macro ABSSUB 4 ; dst, a, b, tmp + psubusb %1, %2, %3 + psubusb %4, %3, %2 + por %1, %4 +%endmacro + +%macro TRANSPOSE_16x4_AND_WRITE_4x32 5 + ; transpose 16x4 + punpcklbw m%5, m%1, m%2 + punpckhbw m%1, m%2 + punpcklbw m%2, m%3, m%4 + punpckhbw m%3, m%4 + punpcklwd m%4, m%5, m%2 + punpckhwd m%5, m%2 + punpcklwd m%2, m%1, m%3 + punpckhwd m%1, m%3 + + ; write out + movd [dstq+strideq*0-2], xm%4 + pextrd [dstq+strideq*1-2], xm%4, 1 + pextrd [dstq+strideq*2-2], xm%4, 2 + pextrd [dstq+stride3q-2], xm%4, 3 + lea dstq, [dstq+strideq*4] + movd [dstq+strideq*0-2], xm%5 + pextrd [dstq+strideq*1-2], xm%5, 1 + pextrd [dstq+strideq*2-2], xm%5, 2 + pextrd [dstq+stride3q-2], xm%5, 3 + lea dstq, [dstq+strideq*4] + movd [dstq+strideq*0-2], xm%2 + pextrd [dstq+strideq*1-2], xm%2, 1 + pextrd [dstq+strideq*2-2], xm%2, 2 + pextrd [dstq+stride3q-2], xm%2, 3 + lea dstq, [dstq+strideq*4] + movd [dstq+strideq*0-2], xm%1 + pextrd [dstq+strideq*1-2], xm%1, 1 + pextrd [dstq+strideq*2-2], xm%1, 2 + pextrd [dstq+stride3q-2], xm%1, 3 + lea dstq, [dstq+strideq*4] + + vextracti128 xm%4, m%4, 1 + vextracti128 xm%5, m%5, 1 + vextracti128 xm%2, m%2, 1 + vextracti128 xm%1, m%1, 1 + + movd [dstq+strideq*0-2], xm%4 + pextrd [dstq+strideq*1-2], xm%4, 1 + pextrd [dstq+strideq*2-2], xm%4, 2 + pextrd [dstq+stride3q-2], xm%4, 3 + lea dstq, [dstq+strideq*4] + movd [dstq+strideq*0-2], xm%5 + pextrd [dstq+strideq*1-2], xm%5, 1 + pextrd [dstq+strideq*2-2], xm%5, 2 + pextrd [dstq+stride3q-2], xm%5, 3 + lea dstq, [dstq+strideq*4] + movd [dstq+strideq*0-2], xm%2 + pextrd [dstq+strideq*1-2], xm%2, 1 + pextrd [dstq+strideq*2-2], xm%2, 2 + pextrd [dstq+stride3q-2], xm%2, 3 + lea dstq, [dstq+strideq*4] + movd [dstq+strideq*0-2], xm%1 + pextrd [dstq+strideq*1-2], xm%1, 1 + pextrd [dstq+strideq*2-2], xm%1, 2 + pextrd [dstq+stride3q-2], xm%1, 3 + lea dstq, [dstq+strideq*4] +%endmacro + +%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem +%if %1 == 0 + mova %3, m15 +%endif + + ; input in m0-15 + punpcklbw m15, m0, m1 + punpckhbw m0, m1 + punpcklbw m1, m2, m3 + punpckhbw m2, m3 + punpcklbw m3, m4, m5 + punpckhbw m4, m5 + punpcklbw m5, m6, m7 + punpckhbw m6, m7 + punpcklbw m7, m8, m9 + punpckhbw m8, m9 + punpcklbw m9, m10, m11 + punpckhbw m10, m11 + punpcklbw m11, m12, m13 + punpckhbw m12, m13 + mova m13, %3 + mova %3, m12 + punpcklbw m12, m14, m13 + punpckhbw m13, m14, m13 + + ; interleaved in m15,0,1,2,3,4,5,6,7,8,9,10,11,rsp%3,12,13 + punpcklwd m14, m15, m1 + punpckhwd m15, m1 + punpcklwd m1, m0, m2 + punpckhwd m0, m2 + punpcklwd m2, m3, m5 + punpckhwd m3, m5 + punpcklwd m5, m4, m6 + punpckhwd m4, m6 + punpcklwd m6, m7, m9 + punpckhwd m7, m9 + punpcklwd m9, m8, m10 + punpckhwd m8, m10 + punpcklwd m10, m11, m12 + punpckhwd m11, m12 + mova m12, %3 + mova %3, m11 + punpcklwd m11, m12, m13 + 
punpckhwd m12, m13 + + ; interleaved in m14,15,1,0,2,3,5,4,6,7,9,8,10,rsp%3,11,12 + punpckldq m13, m14, m2 + punpckhdq m14, m2 + punpckldq m2, m15, m3 + punpckhdq m15, m3 + punpckldq m3, m1, m5 + punpckhdq m1, m5 + punpckldq m5, m0, m4 + punpckhdq m0, m4 + punpckldq m4, m6, m10 + punpckhdq m6, m10 + punpckldq m10, m9, m11 + punpckhdq m9, m11 + punpckldq m11, m8, m12 + punpckhdq m8, m12 + mova m12, %3 + mova %3, m8 + punpckldq m8, m7, m12 + punpckhdq m7, m12 + + ; interleaved in m13,14,2,15,3,1,5,0,4,6,8,7,10,9,11,rsp%3 + punpcklqdq m12, m13, m4 + punpckhqdq m13, m4 + punpcklqdq m4, m14, m6 + punpckhqdq m14, m6 + punpcklqdq m6, m2, m8 + punpckhqdq m2, m8 + punpcklqdq m8, m15, m7 + punpckhqdq m15, m7 + punpcklqdq m7, m3, m10 + punpckhqdq m3, m10 + punpcklqdq m10, m1, m9 + punpckhqdq m1, m9 + punpcklqdq m9, m5, m11 + punpckhqdq m5, m11 + mova m11, %3 + mova %3, m12 + punpcklqdq m12, m0, m11 + punpckhqdq m0, m11 +%if %2 == 0 + mova m11, %3 +%endif + + ; interleaved m11,13,4,14,6,2,8,15,7,3,10,1,9,5,12,0 + SWAP 0, 11, 1, 13, 5, 2, 4, 6, 8, 7, 15 + SWAP 3, 14, 12, 9 +%endmacro + +%macro FILTER 2 ; width [4/6/8/16], dir [h/v] + ; load data +%ifidn %2, v +%if %1 == 4 + lea tmpq, [dstq+mstrideq*2] + mova m3, [tmpq+strideq*0] ; p1 + mova m4, [tmpq+strideq*1] ; p0 + mova m5, [tmpq+strideq*2] ; q0 + mova m6, [tmpq+stride3q] ; q1 +%else + ; load 6-8 pixels, remainder (for wd=16) will be read inline + lea tmpq, [dstq+mstrideq*4] +%if %1 != 6 + mova m12, [tmpq+strideq*0] +%endif + mova m13, [tmpq+strideq*1] + mova m3, [tmpq+strideq*2] + mova m4, [tmpq+stride3q] + mova m5, [dstq+strideq*0] + mova m6, [dstq+strideq*1] + mova m14, [dstq+strideq*2] +%if %1 != 6 + mova m15, [dstq+stride3q] +%endif +%endif +%else + ; load lines +%if %1 == 4 + movd xm3, [dstq+strideq*0-2] + movd xm4, [dstq+strideq*1-2] + movd xm5, [dstq+strideq*2-2] + movd xm6, [dstq+stride3q -2] + lea tmpq, [dstq+strideq*4] + pinsrd xm3, [tmpq+strideq*0-2], 2 + pinsrd xm4, [tmpq+strideq*1-2], 2 + pinsrd xm5, [tmpq+strideq*2-2], 2 + pinsrd xm6, [tmpq+stride3q -2], 2 + lea tmpq, [tmpq+strideq*4] + pinsrd xm3, [tmpq+strideq*0-2], 1 + pinsrd xm4, [tmpq+strideq*1-2], 1 + pinsrd xm5, [tmpq+strideq*2-2], 1 + pinsrd xm6, [tmpq+stride3q -2], 1 + lea tmpq, [tmpq+strideq*4] + pinsrd xm3, [tmpq+strideq*0-2], 3 + pinsrd xm4, [tmpq+strideq*1-2], 3 + pinsrd xm5, [tmpq+strideq*2-2], 3 + pinsrd xm6, [tmpq+stride3q -2], 3 + lea tmpq, [tmpq+strideq*4] + movd xm12, [tmpq+strideq*0-2] + movd xm13, [tmpq+strideq*1-2] + movd xm14, [tmpq+strideq*2-2] + movd xm15, [tmpq+stride3q -2] + lea tmpq, [tmpq+strideq*4] + pinsrd xm12, [tmpq+strideq*0-2], 2 + pinsrd xm13, [tmpq+strideq*1-2], 2 + pinsrd xm14, [tmpq+strideq*2-2], 2 + pinsrd xm15, [tmpq+stride3q -2], 2 + lea tmpq, [tmpq+strideq*4] + pinsrd xm12, [tmpq+strideq*0-2], 1 + pinsrd xm13, [tmpq+strideq*1-2], 1 + pinsrd xm14, [tmpq+strideq*2-2], 1 + pinsrd xm15, [tmpq+stride3q -2], 1 + lea tmpq, [tmpq+strideq*4] + pinsrd xm12, [tmpq+strideq*0-2], 3 + pinsrd xm13, [tmpq+strideq*1-2], 3 + pinsrd xm14, [tmpq+strideq*2-2], 3 + pinsrd xm15, [tmpq+stride3q -2], 3 + vinserti128 m3, xm12, 1 + vinserti128 m4, xm13, 1 + vinserti128 m5, xm14, 1 + vinserti128 m6, xm15, 1 + + ; transpose 4x16 + ; xm3: A-D0,A-D8,A-D4,A-D12 + ; xm4: A-D1,A-D9,A-D5,A-D13 + ; xm5: A-D2,A-D10,A-D6,A-D14 + ; xm6: A-D3,A-D11,A-D7,A-D15 + punpcklbw m7, m3, m4 + punpckhbw m3, m4 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9 + ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13 + ; xm4: 
A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11 + ; xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15 + punpcklwd m6, m7, m4 + punpckhwd m7, m4 + punpcklwd m4, m3, m5 + punpckhwd m3, m5 + ; xm6: A0-3,B0-3,C0-3,D0-3 + ; xm7: A8-11,B8-11,C8-11,D8-11 + ; xm4: A4-7,B4-7,C4-7,D4-7 + ; xm3: A12-15,B12-15,C12-15,D12-15 + punpckldq m5, m6, m4 + punpckhdq m6, m4 + punpckldq m4, m7, m3 + punpckhdq m7, m3 + ; xm5: A0-7,B0-7 + ; xm6: C0-7,D0-7 + ; xm4: A8-15,B8-15 + ; xm7: C8-15,D8-15 + punpcklqdq m3, m5, m4 + punpckhqdq m4, m5, m4 + punpcklqdq m5, m6, m7 + punpckhqdq m6, m7 + ; xm3: A0-15 + ; xm5: B0-15 + ; xm4: C0-15 + ; xm6: D0-15 +%elif %1 == 6 || %1 == 8 + movq xm3, [dstq+strideq*0-%1/2] + movq xm4, [dstq+strideq*1-%1/2] + movq xm5, [dstq+strideq*2-%1/2] + movq xm6, [dstq+stride3q -%1/2] + lea tmpq, [dstq+strideq*8] + movhps xm3, [tmpq+strideq*0-%1/2] + movhps xm4, [tmpq+strideq*1-%1/2] + movhps xm5, [tmpq+strideq*2-%1/2] + movhps xm6, [tmpq+stride3q -%1/2] + lea tmpq, [tmpq+strideq*8] + movq xm7, [tmpq+strideq*0-%1/2] + movq xm8, [tmpq+strideq*1-%1/2] + movq xm9, [tmpq+strideq*2-%1/2] + movq xm11, [tmpq+stride3q -%1/2] + lea tmpq, [tmpq+strideq*8] + movhps xm7, [tmpq+strideq*0-%1/2] + movhps xm8, [tmpq+strideq*1-%1/2] + movhps xm9, [tmpq+strideq*2-%1/2] + movhps xm11, [tmpq+stride3q -%1/2] + vinserti128 m3, xm7, 1 + vinserti128 m4, xm8, 1 + vinserti128 m5, xm9, 1 + vinserti128 m6, xm11, 1 + lea tmpq, [dstq+strideq*4] + movq xm12, [tmpq+strideq*0-%1/2] + movq xm13, [tmpq+strideq*1-%1/2] + movq xm14, [tmpq+strideq*2-%1/2] + movq xm15, [tmpq+stride3q -%1/2] + lea tmpq, [tmpq+strideq*8] + movhps xm12, [tmpq+strideq*0-%1/2] + movhps xm13, [tmpq+strideq*1-%1/2] + movhps xm14, [tmpq+strideq*2-%1/2] + movhps xm15, [tmpq+stride3q -%1/2] + lea tmpq, [tmpq+strideq*8] + movq xm7, [tmpq+strideq*0-%1/2] + movq xm8, [tmpq+strideq*1-%1/2] + movq xm9, [tmpq+strideq*2-%1/2] + movq xm11, [tmpq+stride3q -%1/2] + lea tmpq, [tmpq+strideq*8] + movhps xm7, [tmpq+strideq*0-%1/2] + movhps xm8, [tmpq+strideq*1-%1/2] + movhps xm9, [tmpq+strideq*2-%1/2] + movhps xm11, [tmpq+stride3q -%1/2] + vinserti128 m12, xm7, 1 + vinserti128 m13, xm8, 1 + vinserti128 m14, xm9, 1 + vinserti128 m15, xm11, 1 + + ; transpose 8x16 + ; xm3: A-H0,A-H8 + ; xm4: A-H1,A-H9 + ; xm5: A-H2,A-H10 + ; xm6: A-H3,A-H11 + ; xm12: A-H4,A-H12 + ; xm13: A-H5,A-H13 + ; xm14: A-H6,A-H14 + ; xm15: A-H7,A-H15 + punpcklbw m7, m3, m4 + punpckhbw m3, m4 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + punpcklbw m6, m12, m13 + punpckhbw m12, m13 + punpcklbw m13, m14, m15 + punpckhbw m14, m15 + ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1 + ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9 + ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3 + ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11 + ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5 + ; xm12: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13 + ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7 + ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15 + punpcklwd m15, m7, m4 + punpckhwd m7, m4 + punpcklwd m4, m3, m5 + punpckhwd m3, m5 + punpcklwd m5, m6, m13 + punpckhwd m6, m13 + punpcklwd m13, m12, m14 + punpckhwd m12, m14 + ; xm15: A0-3,B0-3,C0-3,D0-3 + ; xm7: E0-3,F0-3,G0-3,H0-3 + ; xm4: A8-11,B8-11,C8-11,D8-11 + ; xm3: E8-11,F8-11,G8-11,H8-11 + ; xm5: A4-7,B4-7,C4-7,D4-7 + ; xm6: E4-7,F4-7,G4-7,H4-7 + ; xm13: A12-15,B12-15,C12-15,D12-15 + ; xm12: E12-15,F12-15,G12-15,H12-15 + punpckldq m14, m15, m5 + punpckhdq m15, m5 + punpckldq m5, m7, m6 +%if %1 != 6 + punpckhdq m7, m6 +%endif + 
punpckldq m6, m4, m13 + punpckhdq m4, m13 + punpckldq m13, m3, m12 +%if %1 != 6 + punpckhdq m12, m3, m12 +%endif + ; xm14: A0-7,B0-7 + ; xm15: C0-7,D0-7 + ; xm5: E0-7,F0-7 + ; xm7: G0-7,H0-7 + ; xm6: A8-15,B8-15 + ; xm4: C8-15,D8-15 + ; xm13: E8-15,F8-15 + ; xm12: G8-15,H8-15 + punpcklqdq m3, m14, m6 + punpckhqdq m14, m6 + punpckhqdq m6, m15, m4 + punpcklqdq m15, m4 + punpcklqdq m4, m5, m13 + punpckhqdq m13, m5, m13 +%if %1 == 8 + punpcklqdq m5, m7, m12 + punpckhqdq m12, m7, m12 + ; xm3: A0-15 + ; xm14: B0-15 + ; xm15: C0-15 + ; xm6: D0-15 + ; xm4: E0-15 + ; xm13: F0-15 + ; xm5: G0-15 + ; xm12: H0-15 + SWAP 12, 3, 15 + SWAP 13, 14, 5, 4, 6 + ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,15 +%else + SWAP 13, 3, 14 + SWAP 6, 4, 15, 5 + ; 3,14,15,6,4,13 -> 13,3,4,5,6,14 +%endif +%else + ; load and 16x16 transpose. We only use 14 pixels but we'll need the + ; remainder at the end for the second transpose + movu xm0, [dstq+strideq*0-8] + movu xm1, [dstq+strideq*1-8] + movu xm2, [dstq+strideq*2-8] + movu xm3, [dstq+stride3q -8] + lea tmpq, [dstq+strideq*4] + movu xm4, [tmpq+strideq*0-8] + movu xm5, [tmpq+strideq*1-8] + movu xm6, [tmpq+strideq*2-8] + movu xm7, [tmpq+stride3q -8] + lea tmpq, [tmpq+strideq*4] + movu xm8, [tmpq+strideq*0-8] + movu xm9, [tmpq+strideq*1-8] + movu xm10, [tmpq+strideq*2-8] + movu xm11, [tmpq+stride3q -8] + lea tmpq, [tmpq+strideq*4] + movu xm12, [tmpq+strideq*0-8] + movu xm13, [tmpq+strideq*1-8] + movu xm14, [tmpq+strideq*2-8] + movu xm15, [tmpq+stride3q -8] + lea tmpq, [tmpq+strideq*4] + vinserti128 m0, [tmpq+strideq*0-8], 1 + vinserti128 m1, [tmpq+strideq*1-8], 1 + vinserti128 m2, [tmpq+strideq*2-8], 1 + vinserti128 m3, [tmpq+stride3q -8], 1 + lea tmpq, [tmpq+strideq*4] + vinserti128 m4, [tmpq+strideq*0-8], 1 + vinserti128 m5, [tmpq+strideq*1-8], 1 + vinserti128 m6, [tmpq+strideq*2-8], 1 + vinserti128 m7, [tmpq+stride3q -8], 1 + lea tmpq, [tmpq+strideq*4] + vinserti128 m8, [tmpq+strideq*0-8], 1 + vinserti128 m9, [tmpq+strideq*1-8], 1 + vinserti128 m10, [tmpq+strideq*2-8], 1 + vinserti128 m11, [tmpq+stride3q -8], 1 + lea tmpq, [tmpq+strideq*4] + vinserti128 m12, [tmpq+strideq*0-8], 1 + vinserti128 m13, [tmpq+strideq*1-8], 1 + vinserti128 m14, [tmpq+strideq*2-8], 1 + vinserti128 m15, [tmpq+stride3q -8], 1 + + TRANSPOSE_16X16B 0, 1, [rsp+11*32] + mova [rsp+12*32], m1 + mova [rsp+13*32], m2 + mova [rsp+14*32], m3 + mova [rsp+15*32], m12 + mova [rsp+16*32], m13 + mova [rsp+17*32], m14 + mova [rsp+18*32], m15 + ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15 + SWAP 12, 4, 7 + SWAP 13, 5, 8 + SWAP 3, 6, 9 + SWAP 10, 14 + SWAP 11, 15 +%endif +%endif + + ; load L/E/I/H +%ifidn %2, v + movu m1, [lq] + movu m0, [lq+l_strideq] +%else + movq xm1, [lq] + movq xm2, [lq+l_strideq*2] + movhps xm1, [lq+l_strideq] + movhps xm2, [lq+l_stride3q] + lea lq, [lq+l_strideq*4] + movq xm10, [lq] + movq xm0, [lq+l_strideq*2] + movhps xm10, [lq+l_strideq] + movhps xm0, [lq+l_stride3q] + lea lq, [lq+l_strideq*4] + vinserti128 m1, xm10, 1 + vinserti128 m2, xm0, 1 + shufps m0, m1, m2, q3131 + shufps m1, m2, q2020 +%endif + pxor m2, m2 + pcmpeqb m10, m2, m0 + pand m1, m10 + por m0, m1 ; l[x][] ? 
l[x][] : l[x-stride][] + pshufb m0, [pb_4x1_4x5_4x9_4x13] ; l[x][1] + pcmpeqb m10, m2, m0 ; !L + psrlq m2, m0, [lutq+128] + pand m2, [pb_63] + vpbroadcastb m1, [lutq+136] + pminub m2, m1 + pmaxub m2, [pb_1] ; I + pand m1, m0, [pb_240] + psrlq m1, 4 ; H + paddb m0, [pb_2] + paddb m0, m0 + paddb m0, m2 ; E + pxor m1, [pb_128] + pxor m2, [pb_128] + pxor m0, [pb_128] + + ABSSUB m8, m3, m4, m9 ; abs(p1-p0) + pmaxub m8, m10 + ABSSUB m9, m5, m6, m10 ; abs(q1-q0) + pmaxub m8, m9 +%if %1 == 4 + pxor m8, [pb_128] + pcmpgtb m7, m8, m1 ; hev +%else + pxor m7, m8, [pb_128] + pcmpgtb m7, m1 ; hev + +%if %1 == 6 + ABSSUB m9, m13, m4, m10 ; abs(p2-p0) + pmaxub m9, m8 +%else + ABSSUB m9, m12, m4, m10 ; abs(p3-p0) + pmaxub m9, m8 + ABSSUB m10, m13, m4, m11 ; abs(p2-p0) + pmaxub m9, m10 +%endif + ABSSUB m10, m5, m14, m11 ; abs(q2-q0) + pmaxub m9, m10 +%if %1 != 6 + ABSSUB m10, m5, m15, m11 ; abs(q3-q0) + pmaxub m9, m10 +%endif + pxor m9, [pb_128] + pcmpgtb m9, [pb_129] ; !flat8in + +%if %1 == 6 + ABSSUB m10, m13, m3, m1 ; abs(p2-p1) +%else + ABSSUB m10, m12, m13, m11 ; abs(p3-p2) + ABSSUB m11, m13, m3, m1 ; abs(p2-p1) + pmaxub m10, m11 + ABSSUB m11, m14, m15, m1 ; abs(q3-q2) + pmaxub m10, m11 +%endif + ABSSUB m11, m14, m6, m1 ; abs(q2-q1) + pmaxub m10, m11 +%if %1 == 16 + vpbroadcastd m11, [maskq+8] + vpbroadcastd m1, [maskq+4] + por m11, m1 + pand m11, [pb_mask] + pcmpeqd m11, [pb_mask] + pand m10, m11 +%else + vpbroadcastd m11, [maskq+4] + pand m11, [pb_mask] + pcmpeqd m11, [pb_mask] + pand m10, m11 ; only apply fm-wide to wd>4 blocks +%endif + pmaxub m8, m10 + + pxor m8, [pb_128] +%endif + pcmpgtb m8, m2 + + ABSSUB m10, m3, m6, m11 ; abs(p1-q1) + ABSSUB m11, m4, m5, m2 ; abs(p0-q0) + paddusb m11, m11 + pand m10, [pb_254] + psrlq m10, 1 + paddusb m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1) + pxor m10, [pb_128] + pcmpgtb m10, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E + por m8, m10 + +%if %1 == 16 +%ifidn %2, v + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1] +%else + mova m0, [rsp+12*32] +%endif + ABSSUB m1, m0, m4, m2 +%ifidn %2, v + mova m0, [tmpq+strideq*2] +%else + mova m0, [rsp+13*32] +%endif + ABSSUB m2, m0, m4, m10 + pmaxub m1, m2 +%ifidn %2, v + mova m0, [tmpq+stride3q] +%else + mova m0, [rsp+14*32] +%endif + ABSSUB m2, m0, m4, m10 + pmaxub m1, m2 +%ifidn %2, v + lea tmpq, [dstq+strideq*4] + mova m0, [tmpq+strideq*0] +%else + mova m0, [rsp+15*32] +%endif + ABSSUB m2, m0, m5, m10 + pmaxub m1, m2 +%ifidn %2, v + mova m0, [tmpq+strideq*1] +%else + mova m0, [rsp+16*32] +%endif + ABSSUB m2, m0, m5, m10 + pmaxub m1, m2 +%ifidn %2, v + mova m0, [tmpq+strideq*2] +%else + mova m0, [rsp+17*32] +%endif + ABSSUB m2, m0, m5, m10 + pmaxub m1, m2 + pxor m1, [pb_128] + pcmpgtb m1, [pb_129] ; !flat8out + por m1, m9 ; !flat8in | !flat8out + vpbroadcastd m2, [maskq+8] + pand m10, m2, [pb_mask] + pcmpeqd m10, [pb_mask] + pandn m1, m10 ; flat16 + pandn m1, m8, m1 ; flat16 & fm + + vpbroadcastd m10, [maskq+4] + por m10, m2 + pand m2, m10, [pb_mask] + pcmpeqd m2, [pb_mask] + pandn m9, m2 ; flat8in + pandn m9, m8, m9 + vpbroadcastd m2, [maskq+0] + por m2, m10 + pand m2, [pb_mask] + pcmpeqd m2, [pb_mask] + pandn m8, m2 + pandn m8, m9, m8 ; fm & !flat8 & !flat16 + pandn m9, m1, m9 ; flat8 & !flat16 +%elif %1 != 4 + vpbroadcastd m0, [maskq+4] + pand m2, m0, [pb_mask] + pcmpeqd m2, [pb_mask] + pandn m9, m2 + pandn m9, m8, m9 ; flat8 & fm + vpbroadcastd m2, [maskq+0] + por m0, m2 + pand m0, [pb_mask] + pcmpeqd m0, [pb_mask] + pandn m8, m0 + pandn m8, m9, m8 ; fm & !flat8 +%else + vpbroadcastd m0, [maskq+0] + pand m0, [pb_mask] 
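+ ; For reference, the L/E/I/H thresholds assembled further up decode per
+ ; column as follows (scalar C sketch; sharp0/sharp1 are illustrative names
+ ; for the bytes loaded from lutq+128 and lutq+136):
+ ;   int L = l_cur[x] ? l_cur[x] : l_prev[x]; // zero level inherits neighbor
+ ;   int I = (L >> sharp0) & 63;
+ ;   if (I > sharp1) I = sharp1;
+ ;   if (I < 1)      I = 1;                   // inner-edge limit
+ ;   int H = L >> 4;                          // high-edge-variance threshold
+ ;   int E = 2 * L + 4 + I;                   // edge limit, i.e. 2*(L+2)+I
+ ; The recurring pxor with pb_128 biases unsigned bytes into signed range so
+ ; that pcmpgtb can stand in for unsigned compares (a > b unsigned is the
+ ; same as (a^0x80) > (b^0x80) signed); fm is then the combination of
+ ; abs(p1-p0) <= I, abs(q1-q0) <= I and 2*abs(p0-q0)+(abs(p1-q1)>>1) <= E,
+ ; gated by the per-unit enable bits from [maskq].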
+ pcmpeqd m0, [pb_mask] + pandn m8, m0 ; fm +%endif + + ; short filter + + pxor m3, [pb_128] + pxor m6, [pb_128] + psubsb m10, m3, m6 ; iclip_diff(p1-q1) + pand m10, m7 ; f=iclip_diff(p1-q1)&hev + pxor m4, [pb_128] + pxor m5, [pb_128] + psubsb m11, m5, m4 + paddsb m10, m11 + paddsb m10, m11 + paddsb m10, m11 ; f=iclip_diff(3*(q0-p0)+f) + pand m8, m10 ; f&=fm + paddsb m10, m8, [pb_3] + paddsb m8, [pb_4] + pand m10, [pb_248] + pand m8, [pb_248] + psrlq m10, 3 + psrlq m8, 3 + pxor m10, [pb_16] + pxor m8, [pb_16] + psubb m10, [pb_16] ; f2 + psubb m8, [pb_16] ; f1 + paddsb m4, m10 + psubsb m5, m8 + pxor m4, [pb_128] + pxor m5, [pb_128] + + pxor m8, [pb_128] + pxor m10, m10 + pavgb m8, m10 ; f=(f1+1)>>1 + psubb m8, [pb_64] + pandn m8, m7, m8 ; f&=!hev + paddsb m3, m8 + psubsb m6, m8 + pxor m3, [pb_128] + pxor m6, [pb_128] + +%if %1 == 16 + ; flat16 filter +%ifidn %2, v + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1] ; p6 + mova m2, [tmpq+strideq*2] ; p5 + mova m7, [tmpq+stride3q] ; p4 +%else + mova m0, [rsp+12*32] + mova m2, [rsp+13*32] + mova m7, [rsp+14*32] +%endif + + mova [rsp+0*32], m9 + mova [rsp+1*32], m14 + mova [rsp+2*32], m15 + + ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A + ; write -6 + punpcklbw m14, m0, m12 + punpckhbw m15, m0, m12 + pmaddubsw m10, m14, [pb_7_1] + pmaddubsw m11, m15, [pb_7_1] ; p6*7+p3 + punpcklbw m8, m2, m7 + punpckhbw m9, m2, m7 + pmaddubsw m8, [pb_2] + pmaddubsw m9, [pb_2] + paddw m10, m8 + paddw m11, m9 ; p6*7+p5*2+p4*2+p3 + punpcklbw m8, m13, m3 + punpckhbw m9, m13, m3 + pmaddubsw m8, [pb_1] + pmaddubsw m9, [pb_1] + paddw m10, m8 + paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1 + punpcklbw m8, m4, m5 + punpckhbw m9, m4, m5 + pmaddubsw m8, [pb_1] + pmaddubsw m9, [pb_1] + paddw m10, m8 + paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 + pmulhrsw m8, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m8, m9 + pand m8, m1 + pandn m9, m1, m2 + por m8, m9 +%ifidn %2, v + mova [tmpq+strideq*2], m8 ; p5 +%else + mova [rsp+13*32], m8 +%endif + + ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B + ; write -5 + pmaddubsw m14, [pb_m1_1] + pmaddubsw m15, [pb_m1_1] + paddw m10, m14 + paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0 + punpcklbw m8, m0, m6 + punpckhbw m9, m0, m6 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + mova [rsp+3*32], m8 + mova [rsp+4*32], m9 + paddw m10, m8 + paddw m11, m9 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1 + pmulhrsw m8, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m8, m9 + vpblendvb m8, m7, m8, m1 +%ifidn %2, v + mova [tmpq+stride3q], m8 ; p4 +%else + mova [rsp+14*32], m8 +%endif + + ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C + ; write -4 + mova m14, [rsp+1*32] + punpcklbw m8, m0, m13 + punpckhbw m9, m0, m13 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + paddw m10, m8 + paddw m11, m9 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1 + punpcklbw m8, m2, m14 + punpckhbw m2, m14 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m2, [pb_m1_1] + mova [rsp+1*32], m8 + paddw m10, m8 + paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2 + pmulhrsw m8, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m8, m9 + vpblendvb m8, m12, m8, m1 +%ifidn %2, v + mova [tmpq+strideq*4], m8 ; p3 +%else + mova [rsp+19*32], m8 +%endif + + ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D + ; write -3 + mova m15, [rsp+2*32] + punpcklbw m8, m0, m3 + punpckhbw m9, m0, m3 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + paddw m10, m8 + paddw m11, m9 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2 + punpcklbw m8, m7, m15 + punpckhbw 
m7, m15 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m7, [pb_m1_1] + mova [rsp+2*32], m8 + paddw m10, m8 + paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3 + pmulhrsw m8, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m8, m9 + vpblendvb m8, m13, m8, m1 + mova [rsp+6*32], m8 ; don't clobber p2/m13 since we need it in F + + ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E + ; write -2 +%ifidn %2, v + lea tmpq, [dstq+strideq*4] +%endif + punpcklbw m8, m0, m4 + punpckhbw m9, m0, m4 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + paddw m10, m8 + paddw m11, m9 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3 +%ifidn %2, v + mova m9, [tmpq+strideq*0] ; q4 +%else + mova m9, [rsp+15*32] +%endif + punpcklbw m8, m12, m9 + punpckhbw m9, m12, m9 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + mova [rsp+7*32], m8 + mova [rsp+5*32], m9 + paddw m10, m8 + paddw m11, m9 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4 + pmulhrsw m8, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m8, m9 + vpblendvb m8, m3, m8, m1 + mova [rsp+8*32], m8 ; don't clobber p1/m3 since we need it in G + + ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F + ; write -1 +%ifidn %2, v + mova m9, [tmpq+strideq*1] ; q5 +%else + mova m9, [rsp+16*32] +%endif + punpcklbw m8, m0, m5 + punpckhbw m0, m5 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m0, [pb_m1_1] + paddw m10, m8 + paddw m11, m0 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4 + punpcklbw m0, m13, m9 + punpckhbw m9, m13, m9 + mova m13, [rsp+6*32] + pmaddubsw m0, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + mova [rsp+ 9*32], m0 + mova [rsp+10*32], m9 + paddw m10, m0 + paddw m11, m9 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5 + pmulhrsw m0, m10, [pw_2048] + pmulhrsw m8, m11, [pw_2048] + packuswb m0, m8 + vpblendvb m0, m4, m0, m1 + mova [rsp+6*32], m0 ; don't clobber p0/m4 since we need it in H + + ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G + ; write +0 +%ifidn %2, v + mova m0, [tmpq+strideq*2] ; q6 +%else + mova m0, [rsp+17*32] +%endif + paddw m10, [rsp+3*32] + paddw m11, [rsp+4*32] ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5 + punpcklbw m8, m3, m0 + punpckhbw m9, m3, m0 + mova m3, [rsp+8*32] + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + mova [rsp+3*32], m8 + mova [rsp+4*32], m9 + paddw m10, m8 + paddw m11, m9 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6 + pmulhrsw m8, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m8, m9 + vpblendvb m8, m5, m8, m1 + mova [rsp+8*32], m8 ; don't clobber q0/m5 since we need it in I + + ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H + ; write +1 + paddw m10, [rsp+1*32] + paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6 + punpcklbw m8, m4, m0 + punpckhbw m2, m4, m0 + mova m4, [rsp+6*32] + pmaddubsw m8, [pb_m1_1] + pmaddubsw m2, [pb_m1_1] + paddw m10, m8 + paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2 + pmulhrsw m2, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m2, m9 + vpblendvb m2, m6, m2, m1 ; don't clobber q1/m6 since we need it in K + + ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I + ; write +2 + paddw m10, [rsp+2*32] + paddw m11, m7 ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2 + punpcklbw m8, m5, m0 + punpckhbw m9, m5, m0 + mova m5, [rsp+8*32] + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + paddw m10, m8 + paddw m11, m9 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3 + pmulhrsw m7, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m7, m9 + vpblendvb m7, m14, m7, m1 ; don't clobber q2/m14 since we need it in K + + ; sub p3/q1, add q4/q6 
[reuse -p3,+q4 from E][-q1,+q6] J + ; write +3 + paddw m10, [rsp+7*32] + paddw m11, [rsp+5*32] ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3 + punpcklbw m8, m6, m0 + punpckhbw m9, m6, m0 + SWAP 2, 6 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + paddw m10, m8 + paddw m11, m9 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4 + pmulhrsw m8, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m8, m9 + vpblendvb m8, m15, m8, m1 +%ifidn %2, v + mova [tmpq+mstrideq], m8 ; q3 +%else + mova [rsp+20*32], m8 +%endif + + ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K + ; write +4 + paddw m10, [rsp+ 9*32] + paddw m11, [rsp+10*32] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 + punpcklbw m8, m14, m0 + punpckhbw m9, m14, m0 + SWAP 14, 7 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + paddw m10, m8 + paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 + pmulhrsw m8, m10, [pw_2048] + pmulhrsw m9, m11, [pw_2048] + packuswb m8, m9 +%ifidn %2, v + mova m9, [tmpq+strideq*0] +%else + mova m9, [rsp+15*32] +%endif + vpblendvb m8, m9, m8, m1 +%ifidn %2, v + mova [tmpq+strideq*0], m8 ; q4 +%else + mova [rsp+15*32], m8 +%endif + + ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L + ; write +5 + paddw m10, [rsp+3*32] + paddw m11, [rsp+4*32] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 + punpcklbw m8, m15, m0 + punpckhbw m9, m15, m0 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m9, [pb_m1_1] + paddw m10, m8 + paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 + pmulhrsw m10, [pw_2048] + pmulhrsw m11, [pw_2048] + packuswb m10, m11 +%ifidn %2, v + mova m11, [tmpq+strideq*1] +%else + mova m11, [rsp+16*32] +%endif + vpblendvb m10, m11, m10, m1 +%ifidn %2, v + mova [tmpq+strideq*1], m10 ; q5 +%else + mova [rsp+16*32], m10 +%endif + + mova m9, [rsp+0*32] +%ifidn %2, v + lea tmpq, [dstq+mstrideq*4] +%endif +%endif +%if %1 >= 8 + ; flat8 filter + punpcklbw m0, m12, m3 + punpckhbw m1, m12, m3 + pmaddubsw m2, m0, [pb_3_1] + pmaddubsw m7, m1, [pb_3_1] ; 3 * p3 + p1 + punpcklbw m8, m13, m4 + punpckhbw m11, m13, m4 + pmaddubsw m8, [pb_2_1] + pmaddubsw m11, [pb_2_1] + paddw m2, m8 + paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + punpcklbw m8, m5, [pb_4] + punpckhbw m11, m5, [pb_4] + pmaddubsw m8, [pb_1] + pmaddubsw m11, [pb_1] + paddw m2, m8 + paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4 + psrlw m8, m2, 3 + psrlw m11, m7, 3 + packuswb m8, m11 + vpblendvb m10, m13, m8, m9 ; p2 +%ifidn %2, v + mova [tmpq+strideq*1], m10 ; p2 +%endif + + pmaddubsw m8, m0, [pb_m1_1] + pmaddubsw m11, m1, [pb_m1_1] + paddw m2, m8 + paddw m7, m11 + punpcklbw m8, m13, m6 + punpckhbw m11, m13, m6 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m11, [pb_m1_1] + paddw m2, m8 + paddw m7, m11 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4 + psrlw m8, m2, 3 + psrlw m11, m7, 3 + packuswb m8, m11 + vpblendvb m8, m3, m8, m9 ; p1 +%ifidn %2, v + mova [tmpq+strideq*2], m8 ; p1 +%else + mova [rsp+0*32], m8 +%endif + + pmaddubsw m0, [pb_1] + pmaddubsw m1, [pb_1] + psubw m2, m0 + psubw m7, m1 + punpcklbw m8, m4, m14 + punpckhbw m11, m4, m14 + pmaddubsw m8, [pb_1] + pmaddubsw m11, [pb_1] + paddw m2, m8 + paddw m7, m11 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4 + psrlw m8, m2, 3 + psrlw m11, m7, 3 + packuswb m8, m11 + vpblendvb m8, m4, m8, m9 ; p0 +%ifidn %2, v + mova [tmpq+stride3q ], m8 ; p0 +%else + mova [rsp+1*32], m8 +%endif + + punpcklbw m0, m5, m15 + punpckhbw m1, m5, m15 + pmaddubsw m8, m0, [pb_1] + pmaddubsw m11, m1, [pb_1] + paddw m2, m8 + paddw m7, m11 + punpcklbw m8, m4, m12 + punpckhbw m11, m4, m12 + pmaddubsw m8, [pb_1] + pmaddubsw m11, [pb_1] + psubw m2, m8 + psubw m7, 
m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4 + psrlw m8, m2, 3 + psrlw m11, m7, 3 + packuswb m8, m11 + vpblendvb m11, m5, m8, m9 ; q0 +%ifidn %2, v + mova [dstq+strideq*0], m11 ; q0 +%endif + + pmaddubsw m0, [pb_m1_1] + pmaddubsw m1, [pb_m1_1] + paddw m2, m0 + paddw m7, m1 + punpcklbw m8, m13, m6 + punpckhbw m13, m6 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m13, [pb_m1_1] + paddw m2, m8 + paddw m7, m13 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4 + psrlw m8, m2, 3 + psrlw m13, m7, 3 + packuswb m8, m13 + vpblendvb m13, m6, m8, m9 ; q1 +%ifidn %2, v + mova [dstq+strideq*1], m13 ; q1 +%endif + + punpcklbw m0, m3, m6 + punpckhbw m1, m3, m6 + pmaddubsw m0, [pb_1] + pmaddubsw m1, [pb_1] + psubw m2, m0 + psubw m7, m1 + punpcklbw m0, m14, m15 + punpckhbw m1, m14, m15 + pmaddubsw m0, [pb_1] + pmaddubsw m1, [pb_1] + paddw m2, m0 + paddw m7, m1 ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4 + psrlw m2, 3 + psrlw m7, 3 + packuswb m2, m7 + vpblendvb m2, m14, m2, m9 ; q2 +%ifidn %2, v + mova [dstq+strideq*2], m2 ; q2 +%else + mova m0, [rsp+0*32] + mova m1, [rsp+1*32] +%if %1 == 8 + ; 16x8 transpose + punpcklbw m3, m12, m10 + punpckhbw m12, m10 + punpcklbw m10, m0, m1 + punpckhbw m0, m1 + punpcklbw m1, m11, m13 + punpckhbw m11, m13 + punpcklbw m13, m2, m15 + punpckhbw m2, m15 + + punpcklwd m15, m3, m10 + punpckhwd m3, m10 + punpcklwd m10, m12, m0 + punpckhwd m12, m0 + punpcklwd m0, m1, m13 + punpckhwd m1, m13 + punpcklwd m13, m11, m2 + punpckhwd m11, m2 + + punpckldq m2, m15, m0 + punpckhdq m15, m0 + punpckldq m0, m3, m1 + punpckhdq m3, m1 + punpckldq m1, m10, m13 + punpckhdq m10, m13 + punpckldq m13, m12, m11 + punpckhdq m12, m11 + + ; write 8x32 + movq [dstq+strideq*0-4], xm2 + movhps [dstq+strideq*1-4], xm2 + movq [dstq+strideq*2-4], xm15 + movhps [dstq+stride3q -4], xm15 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm0 + movhps [dstq+strideq*1-4], xm0 + movq [dstq+strideq*2-4], xm3 + movhps [dstq+stride3q -4], xm3 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm1 + movhps [dstq+strideq*1-4], xm1 + movq [dstq+strideq*2-4], xm10 + movhps [dstq+stride3q -4], xm10 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm13 + movhps [dstq+strideq*1-4], xm13 + movq [dstq+strideq*2-4], xm12 + movhps [dstq+stride3q -4], xm12 + lea dstq, [dstq+strideq*4] + + vextracti128 xm2, m2, 1 + vextracti128 xm15, m15, 1 + vextracti128 xm0, m0, 1 + vextracti128 xm3, m3, 1 + vextracti128 xm1, m1, 1 + vextracti128 xm10, m10, 1 + vextracti128 xm13, m13, 1 + vextracti128 xm12, m12, 1 + + movq [dstq+strideq*0-4], xm2 + movhps [dstq+strideq*1-4], xm2 + movq [dstq+strideq*2-4], xm15 + movhps [dstq+stride3q -4], xm15 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm0 + movhps [dstq+strideq*1-4], xm0 + movq [dstq+strideq*2-4], xm3 + movhps [dstq+stride3q -4], xm3 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm1 + movhps [dstq+strideq*1-4], xm1 + movq [dstq+strideq*2-4], xm10 + movhps [dstq+stride3q -4], xm10 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm13 + movhps [dstq+strideq*1-4], xm13 + movq [dstq+strideq*2-4], xm12 + movhps [dstq+stride3q -4], xm12 + lea dstq, [dstq+strideq*4] +%else + ; 16x16 transpose and store + SWAP 5, 10, 2 + SWAP 6, 0 + SWAP 7, 1 + SWAP 8, 11 + SWAP 9, 13 + mova m0, [rsp+11*32] + mova m1, [rsp+12*32] + mova m2, [rsp+13*32] + mova m3, [rsp+14*32] + mova m4, [rsp+19*32] + mova m11, [rsp+20*32] + mova m12, [rsp+15*32] + mova m13, [rsp+16*32] + mova m14, [rsp+17*32] + TRANSPOSE_16X16B 1, 0, [rsp+18*32] + movu [dstq+strideq*0-8], xm0 + movu [dstq+strideq*1-8], 
xm1 + movu [dstq+strideq*2-8], xm2 + movu [dstq+stride3q -8], xm3 + lea dstq, [dstq+strideq*4] + movu [dstq+strideq*0-8], xm4 + movu [dstq+strideq*1-8], xm5 + movu [dstq+strideq*2-8], xm6 + movu [dstq+stride3q -8], xm7 + lea dstq, [dstq+strideq*4] + movu [dstq+strideq*0-8], xm8 + movu [dstq+strideq*1-8], xm9 + movu [dstq+strideq*2-8], xm10 + movu [dstq+stride3q -8], xm11 + lea dstq, [dstq+strideq*4] + movu [dstq+strideq*0-8], xm12 + movu [dstq+strideq*1-8], xm13 + movu [dstq+strideq*2-8], xm14 + movu [dstq+stride3q -8], xm15 + lea dstq, [dstq+strideq*4] + vextracti128 [dstq+strideq*0-8], m0, 1 + vextracti128 [dstq+strideq*1-8], m1, 1 + vextracti128 [dstq+strideq*2-8], m2, 1 + vextracti128 [dstq+stride3q -8], m3, 1 + lea dstq, [dstq+strideq*4] + vextracti128 [dstq+strideq*0-8], m4, 1 + vextracti128 [dstq+strideq*1-8], m5, 1 + vextracti128 [dstq+strideq*2-8], m6, 1 + vextracti128 [dstq+stride3q -8], m7, 1 + lea dstq, [dstq+strideq*4] + vextracti128 [dstq+strideq*0-8], m8, 1 + vextracti128 [dstq+strideq*1-8], m9, 1 + vextracti128 [dstq+strideq*2-8], m10, 1 + vextracti128 [dstq+stride3q -8], m11, 1 + lea dstq, [dstq+strideq*4] + vextracti128 [dstq+strideq*0-8], m12, 1 + vextracti128 [dstq+strideq*1-8], m13, 1 + vextracti128 [dstq+strideq*2-8], m14, 1 + vextracti128 [dstq+stride3q -8], m15, 1 + lea dstq, [dstq+strideq*4] +%endif +%endif +%elif %1 == 6 + ; flat6 filter + + punpcklbw m8, m13, m5 + punpckhbw m11, m13, m5 + pmaddubsw m0, m8, [pb_3_1] + pmaddubsw m1, m11, [pb_3_1] + punpcklbw m7, m4, m3 + punpckhbw m10, m4, m3 + pmaddubsw m2, m7, [pb_2] + pmaddubsw m12, m10, [pb_2] + paddw m0, m2 + paddw m1, m12 + pmulhrsw m2, m0, [pw_4096] + pmulhrsw m12, m1, [pw_4096] + packuswb m2, m12 + vpblendvb m2, m3, m2, m9 +%ifidn %2, v + mova [tmpq+strideq*2], m2 ; p1 +%endif + + pmaddubsw m8, [pb_m1_1] + pmaddubsw m11, [pb_m1_1] + paddw m0, m8 + paddw m1, m11 + punpcklbw m8, m13, m6 + punpckhbw m11, m13, m6 + pmaddubsw m8, [pb_m1_1] + pmaddubsw m11, [pb_m1_1] + paddw m0, m8 + paddw m1, m11 + pmulhrsw m12, m0, [pw_4096] + pmulhrsw m13, m1, [pw_4096] + packuswb m12, m13 + vpblendvb m12, m4, m12, m9 +%ifidn %2, v + mova [tmpq+stride3q], m12 ; p0 +%endif + + paddw m0, m8 + paddw m1, m11 + punpcklbw m8, m3, m14 + punpckhbw m11, m3, m14 + pmaddubsw m14, m8, [pb_m1_1] + pmaddubsw m13, m11, [pb_m1_1] + paddw m0, m14 + paddw m1, m13 + pmulhrsw m14, m0, [pw_4096] + pmulhrsw m13, m1, [pw_4096] + packuswb m14, m13 + vpblendvb m14, m5, m14, m9 +%ifidn %2, v + mova [dstq+strideq*0], m14 ; q0 +%endif + + pmaddubsw m8, [pb_m1_2] + pmaddubsw m11, [pb_m1_2] + paddw m0, m8 + paddw m1, m11 + pmaddubsw m7, [pb_m1_0] + pmaddubsw m10, [pb_m1_0] + paddw m0, m7 + paddw m1, m10 + pmulhrsw m0, [pw_4096] + pmulhrsw m1, [pw_4096] + packuswb m0, m1 + vpblendvb m0, m6, m0, m9 +%ifidn %2, v + mova [dstq+strideq*1], m0 ; q1 +%else + TRANSPOSE_16x4_AND_WRITE_4x32 2, 12, 14, 0, 1 +%endif +%else +%ifidn %2, v + mova [tmpq+strideq*0], m3 ; p1 + mova [tmpq+strideq*1], m4 ; p0 + mova [tmpq+strideq*2], m5 ; q0 + mova [tmpq+stride3q ], m6 ; q1 +%else + TRANSPOSE_16x4_AND_WRITE_4x32 3, 4, 5, 6, 7 +%endif +%endif +%endmacro + +INIT_YMM avx2 +cglobal lpf_v_sb_y_8bpc, 7, 10, 16, 32 * 11, \ + dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp + shl l_strideq, 2 + sub lq, l_strideq + mov mstrideq, strideq + neg mstrideq + lea stride3q, [strideq*3] + +.loop: + cmp byte [maskq+8], 0 ; vmask[2] + je .no_flat16 + + FILTER 16, v + jmp .end + +.no_flat16: + cmp byte [maskq+4], 0 ; vmask[1] + je .no_flat + + FILTER 8, v + jmp .end + +.no_flat: 
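+ ; Dispatch shape of this entry point, as a scalar sketch (hypothetical C;
+ ; one iteration per 32-pixel strip, each vmask bit covering a 4px unit,
+ ; so one byte of each 32-bit vmask word is consumed per strip):
+ ;   for (; w > 0; w -= 8, dst += 32, l += 32, mask++) {
+ ;       if (mask[8])      filter_wd16(...); // vmask[2]: wd16 wanted somewhere
+ ;       else if (mask[4]) filter_wd8(...);  // vmask[1]: wd8 (wd6 for uv)
+ ;       else if (mask[0]) filter_wd4(...);  // vmask[0]: wd4
+ ;   }
+ ; The widest FILTER needed anywhere in the strip runs once; narrower edges
+ ; inside the strip are still handled correctly because the per-unit
+ ; flat16/flat8/fm masks computed inside FILTER select the output per pixel.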
+ cmp byte [maskq+0], 0 ; vmask[0] + je .end + + call .v4 + +.end: + add lq, 32 + add dstq, 32 + add maskq, 1 + sub wd, 8 + jg .loop + RET +ALIGN function_align +.v4: + FILTER 4, v + ret + +INIT_YMM avx2 +cglobal lpf_h_sb_y_8bpc, 7, 10, 16, 32 * 21, \ + dst, stride, mask, l, l_stride, lut, \ + h, stride3, l_stride3, tmp + shl l_strideq, 2 + sub lq, 4 + lea stride3q, [strideq*3] + lea l_stride3q, [l_strideq*3] + +.loop: + cmp byte [maskq+8], 0 ; vmask[2] + je .no_flat16 + + FILTER 16, h + jmp .end + +.no_flat16: + cmp byte [maskq+4], 0 ; vmask[1] + je .no_flat + + FILTER 8, h + jmp .end + +.no_flat: + cmp byte [maskq+0], 0 ; vmask[0] + je .no_filter + + call .h4 + jmp .end + +.no_filter: + lea dstq, [dstq+stride3q*8] + lea lq, [lq+l_strideq*8] + lea dstq, [dstq+strideq*8] +.end: + add maskq, 1 + sub hd, 8 + jg .loop + RET +ALIGN function_align +.h4: + FILTER 4, h + ret + +INIT_YMM avx2 +cglobal lpf_v_sb_uv_8bpc, 7, 10, 16, \ + dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp + shl l_strideq, 2 + sub lq, l_strideq + mov mstrideq, strideq + neg mstrideq + lea stride3q, [strideq*3] + +.loop: + cmp byte [maskq+4], 0 ; vmask[1] + je .no_flat + + FILTER 6, v + jmp .end + +.no_flat: + cmp byte [maskq+0], 0 ; vmask[0] + je .end + + call mangle(private_prefix %+ _lpf_v_sb_y_8bpc_avx2).v4 + +.end: + add lq, 32 + add dstq, 32 + add maskq, 1 + sub wd, 8 + jg .loop + RET + +INIT_YMM avx2 +cglobal lpf_h_sb_uv_8bpc, 7, 10, 16, \ + dst, stride, mask, l, l_stride, lut, \ + h, stride3, l_stride3, tmp + shl l_strideq, 2 + sub lq, 4 + lea stride3q, [strideq*3] + lea l_stride3q, [l_strideq*3] + +.loop: + cmp byte [maskq+4], 0 ; vmask[1] + je .no_flat + + FILTER 6, h + jmp .end + +.no_flat: + cmp byte [maskq+0], 0 ; vmask[0] + je .no_filter + + call mangle(private_prefix %+ _lpf_h_sb_y_8bpc_avx2).h4 + jmp .end + +.no_filter: + lea dstq, [dstq+stride3q*8] + lea lq, [lq+l_strideq*8] + lea dstq, [dstq+strideq*8] +.end: + add maskq, 1 + sub hd, 8 + jg .loop + RET + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/loopfilter_avx512.asm b/third_party/dav1d/src/x86/loopfilter_avx512.asm new file mode 100644 index 0000000000..0218b624d3 --- /dev/null +++ b/third_party/dav1d/src/x86/loopfilter_avx512.asm @@ -0,0 +1,1534 @@ +; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +pb_4x0_4x4_4x8_4x12: times 4 db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 + +pb_mask: dd 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080 + dd 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, 0x8000 + +hmulA: dd 0, 8, 16, 24, 32, 40, 48, 56, 4, 12, 20, 28, 36, 44, 52, 60 +hmulB: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +hmulC: dd 0, 1, 2, 3, 16, 17, 18, 19, 32, 33, 34, 35, 48, 49, 50, 51 +hmulD: dd 0, 1, 16, 17, 32, 33, 48, 49 +hshuf4:db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 + +pb_1: times 4 db 1 +pb_2: times 4 db 2 +pb_3: times 4 db 3 +pb_4: times 4 db 4 +pb_16: times 4 db 16 +pb_63: times 4 db 63 +pb_64: times 4 db 64 +pb_128: times 4 db 0x80 +pb_240: times 4 db 0xf0 +pb_248: times 4 db 0xf8 +pb_254: times 4 db 0xfe +pb_2_1: times 2 db 2, 1 +pb_3_1: times 2 db 3, 1 +pb_7_1: times 2 db 7, 1 +pb_m1_0: times 2 db -1, 0 +pb_m1_1: times 2 db -1, 1 +pb_m1_2: times 2 db -1, 2 +pw_2048: times 2 dw 2048 +pw_4096: times 2 dw 4096 + +SECTION .text + +%macro ABSSUB 4 ; dst, a, b, tmp + psubusb %1, %2, %3 + psubusb %4, %3, %2 + por %1, %4 +%endmacro + +%macro TRANSPOSE_16x4_AND_WRITE_4x32 5 + punpcklbw m%5, m%1, m%2 + punpckhbw m%1, m%2 + punpcklbw m%2, m%3, m%4 + punpckhbw m%3, m%4 + punpcklwd m%4, m%5, m%2 + punpckhwd m%5, m%2 + punpcklwd m%2, m%1, m%3 + punpckhwd m%1, m%3 + kmovw k1, k6 + lea t0, [dstq+strideq*4] + vpscatterdd [dstq+m19-2]{k1}, m%4 + kmovw k1, k6 + lea t1, [dstq+strideq*8] + vpscatterdd [t0 +m19-2]{k1}, m%5 + kmovw k1, k6 + lea t2, [t0 +strideq*8] + vpscatterdd [t1 +m19-2]{k1}, m%2 + kmovw k1, k6 + vpscatterdd [t2 +m19-2]{k1}, m%1 +%endmacro + +%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem +%if %1 == 0 + SWAP m16, m22 +%endif + punpcklbw m22, m24, m26 + punpckhbw m24, m26 + punpcklbw m26, m2, m3 + punpckhbw m2, m3 + punpcklbw m3, m4, m5 + punpckhbw m4, m5 + punpcklbw m5, m6, m7 + punpckhbw m6, m7 + punpcklbw m7, m8, m9 + punpckhbw m8, m9 + punpcklbw m9, m10, m11 + punpckhbw m10, m11 + punpcklbw m11, m25, m13 + punpckhbw m25, m13 +%if %1 == 0 + SWAP m13, m16 +%else + mova m13, %3 +%endif + SWAP m16, m25 + punpcklbw m25, m14, m13 + punpckhbw m13, m14, m13 + ; interleaved in m22,24,26,2,3,4,5,6,7,8,9,10,11,rsp%3,25,13 + punpcklwd m14, m22, m26 + punpckhwd m22, m26 + punpcklwd m26, m24, m2 + punpckhwd m24, m2 + punpcklwd m2, m3, m5 + punpckhwd m3, m5 + punpcklwd m5, m4, m6 + punpckhwd m4, m6 + punpcklwd m6, m7, m9 + punpckhwd m7, m9 + punpcklwd m9, m8, m10 + punpckhwd m8, m10 + punpcklwd m10, m11, m25 + punpckhwd m11, m25 + SWAP m25, m16, m11 + punpcklwd m11, m25, m13 + punpckhwd m25, m13 + ; interleaved in m14,15,26,24,2,3,5,4,6,7,9,8,10,rsp%3,11,25 + punpckldq m13, m14, m2 + punpckhdq m14, m2 + punpckldq m2, m22, m3 + punpckhdq m22, m3 + punpckldq m3, m26, m5 + punpckhdq m26, m5 + punpckldq m5, m24, m4 + punpckhdq m24, m4 + punpckldq m4, m6, m10 + punpckhdq m6, m10 + punpckldq 
m10, m9, m11 + punpckhdq m9, m11 + punpckldq m11, m8, m25 + punpckhdq m8, m25 + SWAP m25, m16, m8 + punpckldq m8, m7, m25 + punpckhdq m7, m25 + ; interleaved in m13,14,2,15,3,26,5,24,4,6,8,7,10,9,11,rsp%3 + punpcklqdq m25, m13, m4 + punpckhqdq m13, m4 + punpcklqdq m4, m14, m6 + punpckhqdq m14, m6 + punpcklqdq m6, m2, m8 + punpckhqdq m2, m8 + punpcklqdq m8, m22, m7 + punpckhqdq m22, m7 + punpcklqdq m7, m3, m10 + punpckhqdq m3, m10 + punpcklqdq m10, m26, m9 + punpckhqdq m26, m9 + punpcklqdq m9, m5, m11 + punpckhqdq m5, m11 + SWAP m11, m16 +%if %2 == 0 + SWAP m16, m25 +%else + mova %3, m25 +%endif + punpcklqdq m25, m24, m11 + punpckhqdq m24, m11 +%if %2 == 0 + SWAP m11, m16 +%endif + ; interleaved m11,13,4,14,6,2,8,15,7,3,10,26,9,5,25,24 + SWAP 24, 11, 26, 13, 5, 2, 4, 6, 8, 7, 22 + SWAP 3, 14, 25, 9 +%endmacro + +%macro FILTER 2 ; width [4/6/8/16], dir [h/v] + ; load data +%ifidn %2, v +%define is_h 0 +%if %1 == 4 + lea t0, [dstq+mstrideq*2] + mova m3, [t0 +strideq*0] ; p1 + mova m4, [t0 +strideq*1] ; p0 + mova m5, [t0 +strideq*2] ; q0 + mova m6, [t0 +stride3q ] ; q1 +%else + ; load 6-8 pixels, remainder (for wd=16) will be read inline +%if %1 == 16 + lea t0, [dstq+mstrideq*8] + mova m16, [t0 +strideq*1] + mova m17, [t0 +strideq*2] + mova m18, [t0 +stride3q ] +%endif + lea t0, [dstq+mstrideq*4] +%if %1 != 6 + mova m25, [t0 +strideq*0] +%endif + mova m13, [t0 +strideq*1] + mova m3, [t0 +strideq*2] + mova m4, [t0 +stride3q ] + mova m5, [dstq+strideq*0] + mova m6, [dstq+strideq*1] + mova m14, [dstq+strideq*2] +%if %1 != 6 + mova m22, [dstq+stride3q ] +%endif +%if %1 == 16 + lea t0, [dstq+strideq*4] + mova m29, [t0 +strideq*0] + mova m30, [t0 +strideq*1] + mova m31, [t0 +strideq*2] +%endif +%endif +%else ; h +%define is_h 1 + ; load lines +%if %1 == 4 + vbroadcasti32x4 m0, [hshuf4] + kmovw k1, k6 + lea t0, [dstq+strideq*4] + vpgatherdd m3{k1}, [dstq+m19-2] + kmovw k1, k6 + lea t1, [dstq+strideq*8] + vpgatherdd m4{k1}, [t0 +m19-2] + kmovw k1, k6 + lea t2, [t0 +strideq*8] + vpgatherdd m5{k1}, [t1 +m19-2] + kmovw k1, k6 + vpgatherdd m6{k1}, [t2 +m19-2] + pshufb m3, m0 + pshufb m4, m0 + pshufb m5, m0 + pshufb m6, m0 + punpckldq m7, m3, m4 + punpckhdq m3, m4 + punpckldq m4, m5, m6 + punpckhdq m5, m6 + punpcklqdq m6, m7, m4 + punpckhqdq m7, m4 + punpcklqdq m4, m3, m5 + punpckhqdq m3, m5 + SWAP 3, 6 + SWAP 5, 4, 7 + ; 6,7,4,3 -> 3,4,5,6 +%elif %1 == 6 || %1 == 8 + kmovb k1, k7 + lea t0, [dstq+strideq*1] + vpgatherdq m3{k1}, [dstq+ym21-%1/2] + kmovb k1, k7 + lea t1, [dstq+strideq*2] + vpgatherdq m4{k1}, [t0 +ym21-%1/2] + kmovb k1, k7 + lea t2, [dstq+stride3q ] + vpgatherdq m5{k1}, [t1 +ym21-%1/2] + kmovb k1, k7 + vextracti32x8 ym0, m21, 1 + vpgatherdq m6{k1}, [t2 +ym21-%1/2] + kmovb k1, k7 + vpgatherdq m12{k1}, [dstq+ym0 -%1/2] + kmovb k1, k7 + vpgatherdq m13{k1}, [t0 +ym0 -%1/2] + kmovb k1, k7 + vpgatherdq m14{k1}, [t1 +ym0 -%1/2] + kmovb k1, k7 + vpgatherdq m15{k1}, [t2 +ym0 -%1/2] + ; transpose 8x16 + ; xm3: A-H0,A-H8 + ; xm4: A-H1,A-H9 + ; xm5: A-H2,A-H10 + ; xm6: A-H3,A-H11 + ; xm12: A-H4,A-H12 + ; xm13: A-H5,A-H13 + ; xm14: A-H6,A-H14 + ; xm15: A-H7,A-H15 + punpcklbw m7, m3, m4 + punpckhbw m3, m4 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + punpcklbw m6, m12, m13 + punpckhbw m12, m13 + punpcklbw m13, m14, m15 + punpckhbw m14, m15 + ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1 + ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9 + ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3 + ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11 + ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5 + ; xm12: 
A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13 + ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7 + ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15 + punpcklwd m15, m7, m4 + punpckhwd m7, m4 + punpcklwd m4, m3, m5 + punpckhwd m3, m5 + punpcklwd m5, m6, m13 + punpckhwd m6, m13 + punpcklwd m13, m12, m14 + punpckhwd m12, m14 + ; xm15: A0-3,B0-3,C0-3,D0-3 + ; xm7: E0-3,F0-3,G0-3,H0-3 + ; xm4: A8-11,B8-11,C8-11,D8-11 + ; xm3: E8-11,F8-11,G8-11,H8-11 + ; xm5: A4-7,B4-7,C4-7,D4-7 + ; xm6: E4-7,F4-7,G4-7,H4-7 + ; xm13: A12-15,B12-15,C12-15,D12-15 + ; xm12: E12-15,F12-15,G12-15,H12-15 + punpckldq m14, m15, m5 + punpckhdq m15, m5 + punpckldq m5, m7, m6 + %if %1 != 6 + punpckhdq m7, m6 + %endif + punpckldq m6, m4, m13 + punpckhdq m4, m13 + punpckldq m13, m3, m12 + %if %1 != 6 + punpckhdq m12, m3, m12 + %endif + ; xm14: A0-7,B0-7 + ; xm15: C0-7,D0-7 + ; xm5: E0-7,F0-7 + ; xm7: G0-7,H0-7 + ; xm6: A8-15,B8-15 + ; xm4: C8-15,D8-15 + ; xm13: E8-15,F8-15 + ; xm12: G8-15,H8-15 + punpcklqdq m3, m14, m6 + punpckhqdq m14, m6 + punpckhqdq m6, m15, m4 + punpcklqdq m15, m4 + punpcklqdq m4, m5, m13 + punpckhqdq m13, m5, m13 + %if %1 == 8 + punpcklqdq m5, m7, m12 + punpckhqdq m25, m7, m12 + ; xm3: A0-15 + ; xm14: B0-15 + ; xm15: C0-15 + ; xm6: D0-15 + ; xm4: E0-15 + ; xm13: F0-15 + ; xm5: G0-15 + ; xm25: H0-15 + SWAP 25, 3, 15 + SWAP 13, 14, 5, 4, 6 + SWAP 15, 22 + ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,22 + %else + SWAP 13, 3, 14 + SWAP 6, 4, 15, 5 + ; 3,14,15,6,4,13 -> 13,3,4,5,6,14 + %endif +%else ; 16, h + ; load and 16x16 transpose. We only use 14 pixels but we'll need the + ; remainder at the end for the second transpose + movu xm24, [dstq+strideq*0-8] + movu xm26, [dstq+strideq*1-8] + movu xm2, [dstq+strideq*2-8] + movu xm3, [dstq+stride3q -8] + lea t0, [dstq+strideq*4] + movu xm4, [t0 +strideq*0-8] + movu xm5, [t0 +strideq*1-8] + movu xm6, [t0 +strideq*2-8] + movu xm7, [t0 +stride3q -8] + lea t0, [t0 +strideq*4] + movu xm8, [t0 +strideq*0-8] + movu xm9, [t0 +strideq*1-8] + movu xm10, [t0 +strideq*2-8] + movu xm11, [t0 +stride3q -8] + lea t0, [t0 +strideq*4] + movu xm25, [t0 +strideq*0-8] + movu xm13, [t0 +strideq*1-8] + movu xm14, [t0 +strideq*2-8] + movu xm22, [t0 +stride3q -8] + lea t0, [t0 +strideq*4] + vinserti32x4 ym24, [t0 +strideq*0-8], 1 + vinserti32x4 ym26, [t0 +strideq*1-8], 1 + vinserti32x4 ym2, [t0 +strideq*2-8], 1 + vinserti32x4 ym3, [t0 +stride3q -8], 1 + lea t0, [t0 +strideq*4] + vinserti32x4 ym4, [t0 +strideq*0-8], 1 + vinserti32x4 ym5, [t0 +strideq*1-8], 1 + vinserti32x4 ym6, [t0 +strideq*2-8], 1 + vinserti32x4 ym7, [t0 +stride3q -8], 1 + lea t0, [t0 +strideq*4] + vinserti32x4 ym8, [t0 +strideq*0-8], 1 + vinserti32x4 ym9, [t0 +strideq*1-8], 1 + vinserti32x4 ym10, [t0 +strideq*2-8], 1 + vinserti32x4 ym11, [t0 +stride3q -8], 1 + lea t0, [t0 +strideq*4] + vinserti32x4 ym25, [t0 +strideq*0-8], 1 + vinserti32x4 ym13, [t0 +strideq*1-8], 1 + vinserti32x4 ym14, [t0 +strideq*2-8], 1 + vinserti32x4 ym22, [t0 +stride3q -8], 1 + lea t0, [t0 +strideq*4] + vinserti32x4 m24, [t0 +strideq*0-8], 2 + vinserti32x4 m26, [t0 +strideq*1-8], 2 + vinserti32x4 m2, [t0 +strideq*2-8], 2 + vinserti32x4 m3, [t0 +stride3q -8], 2 + lea t0, [t0 +strideq*4] + vinserti32x4 m4, [t0 +strideq*0-8], 2 + vinserti32x4 m5, [t0 +strideq*1-8], 2 + vinserti32x4 m6, [t0 +strideq*2-8], 2 + vinserti32x4 m7, [t0 +stride3q -8], 2 + lea t0, [t0 +strideq*4] + vinserti32x4 m8, [t0 +strideq*0-8], 2 + vinserti32x4 m9, [t0 +strideq*1-8], 2 + vinserti32x4 m10, [t0 +strideq*2-8], 2 + vinserti32x4 m11, [t0 +stride3q -8], 
2 + lea t0, [t0 +strideq*4] + vinserti32x4 m25, [t0 +strideq*0-8], 2 + vinserti32x4 m13, [t0 +strideq*1-8], 2 + vinserti32x4 m14, [t0 +strideq*2-8], 2 + vinserti32x4 m22, [t0 +stride3q -8], 2 + lea t0, [t0 +strideq*4] + vinserti32x4 m24, [t0 +strideq*0-8], 3 + vinserti32x4 m26, [t0 +strideq*1-8], 3 + vinserti32x4 m2, [t0 +strideq*2-8], 3 + vinserti32x4 m3, [t0 +stride3q -8], 3 + lea t0, [t0 +strideq*4] + vinserti32x4 m4, [t0 +strideq*0-8], 3 + vinserti32x4 m5, [t0 +strideq*1-8], 3 + vinserti32x4 m6, [t0 +strideq*2-8], 3 + vinserti32x4 m7, [t0 +stride3q -8], 3 + lea t0, [t0 +strideq*4] + vinserti32x4 m8, [t0 +strideq*0-8], 3 + vinserti32x4 m9, [t0 +strideq*1-8], 3 + vinserti32x4 m10, [t0 +strideq*2-8], 3 + vinserti32x4 m11, [t0 +stride3q -8], 3 + lea t0, [t0 +strideq*4] + vinserti32x4 m25, [t0 +strideq*0-8], 3 + vinserti32x4 m13, [t0 +strideq*1-8], 3 + vinserti32x4 m14, [t0 +strideq*2-8], 3 + vinserti32x4 m22, [t0 +stride3q -8], 3 + ; + TRANSPOSE_16X16B 0, 1, [rsp+0*64] + SWAP m16, m26 + SWAP m17, m2 + SWAP m18, m3 + SWAP m29, m25 + SWAP m30, m13 + SWAP m31, m14 + mova [rsp+4*64], m22 + ; 4,5,6,7,8,9,10,11 -> 25,13,3,4,5,6,14,22 + SWAP 25, 4, 7 + SWAP 13, 5, 8 + SWAP 3, 6, 9 + SWAP 10, 14 + SWAP 11, 22 +%endif +%endif + + ; load L/E/I/H + vpbroadcastd m15, [pb_1] +%ifidn %2, v + movu m1, [lq] + movu m0, [lq+l_strideq] +%else + kmovw k1, k6 + vpgatherdd m0{k1}, [lq+m20+4] + kmovw k1, k6 + vpgatherdd m1{k1}, [lq+m20+0] +%endif + pxor m2, m2 + pcmpeqb k1, m0, m2 + vmovdqu8 m0{k1}, m1 ; l[x][] ? l[x][] : l[x-stride][] + pshufb m0, pbshuf ; l[x][0] + vpcmpub k3, m0, m2, 4 ; neq ; L + psrlq m2, m0, [lutq+128] + pand m2, [pb_63]{bcstd} + vpbroadcastb m1, [lutq+136] + pminub m2, m1 + pmaxub m2, m15 ; I + pand m1, m0, [pb_240]{bcstd} + psrlq m1, 4 ; H + paddd m0, [pb_2]{bcstd} + paddb m0, m0 + paddb m0, m2 ; E + + ABSSUB m8, m3, m4, m9 ; abs(p1-p0) + ABSSUB m9, m5, m6, m10 ; abs(q1-q0) + pmaxub m8, m9 + vpcmpub k1, m8, m1, 6 ; gt ; hev +%if %1 != 4 + %if %1 == 6 + ABSSUB m9, m13, m4, m10 ; abs(p2-p0) + pmaxub m9, m8 + %else + ABSSUB m9, m25, m4, m10 ; abs(p3-p0) + pmaxub m9, m8 + ABSSUB m10, m13, m4, m11 ; abs(p2-p0) + pmaxub m9, m10 + %endif + ABSSUB m10, m5, m14, m11 ; abs(q2-q0) + pmaxub m9, m10 + %if %1 != 6 + ABSSUB m10, m5, m22, m11 ; abs(q3-q0) + pmaxub m9, m10 + %endif + vpcmpub k2{k3}, m9, m15, 2 ; le ; flat8in + %if %1 == 6 + ABSSUB m10, m13, m3, m1 ; abs(p2-p1) + %else + ABSSUB m10, m25, m13, m11 ; abs(p3-p2) + ABSSUB m11, m13, m3, m1 ; abs(p2-p1) + pmaxub m10, m11 + ABSSUB m11, m14, m22, m1 ; abs(q3-q2) + pmaxub m10, m11 + %endif + ABSSUB m11, m14, m6, m1 ; abs(q2-q1) + pmaxub m10, m11 + %if %1 == 16 + vpbroadcastd m11, [maskq+8] + por m11, [maskq+4]{bcstd} + %else + vpbroadcastd m11, [maskq+4] + %endif + vptestmd k4, m11, pbmask + vmovdqa32 m10{k4}{z}, m10 ; only apply fm-wide to wd>4 blocks + pmaxub m8, m10 +%endif + vpcmpub k3{k3}, m8, m2, 2 ; le + ABSSUB m10, m3, m6, m11 ; abs(p1-q1) + ABSSUB m11, m4, m5, m2 ; abs(p0-q0) + paddusb m11, m11 + pand m10, [pb_254]{bcstd} + psrlq m10, 1 + paddusb m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1) + vpcmpub k3{k3}, m10, m0, 2 ; abs(p0-q0)*2+(abs(p1-q1)>>1) <= E + +%if %1 == 16 + ABSSUB m1, m16, m4, m2 + ABSSUB m2, m17, m4, m10 + pmaxub m1, m2 + ABSSUB m2, m18, m4, m10 + pmaxub m1, m2 + ABSSUB m2, m29, m5, m10 + pmaxub m1, m2 + ABSSUB m2, m30, m5, m10 + pmaxub m1, m2 + ABSSUB m2, m31, m5, m10 + pmaxub m1, m2 + kandq k2, k2, k3 + vpcmpub k4{k2}, m1, m15, 2 ; flat8in & flat8out + vpbroadcastd m2, [maskq+8] + vptestmd k5, m2, pbmask + vpmovm2d m7, k5 + 
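+ ; The vptestmd/vpmovm2d just above and the vptestmb just below form the
+ ; AVX-512 idiom for fanning the per-4px-unit vmask bits out to a per-byte
+ ; k-mask: vptestmd sets one k-bit per dword whose pb_mask bit is present in
+ ; the broadcast vmask word, vpmovm2d turns that into 0/-1 dwords, and
+ ; vptestmb spreads it to one k-bit per byte. Net effect, as a scalar sketch
+ ; (vmask2 is an illustrative name for the dword loaded from maskq+8):
+ ;   for (int x = 0; x < 64; x++)
+ ;       k4[x] &= (vmask2 >> (x >> 2)) & 1;  // keep flat16 only where enabled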
vptestmb k4{k4}, m7, m7 ; flat16 & fm + por m10, m2, [maskq+4]{bcstd} + vptestmd k5, m10, pbmask + vpmovm2d m7, k5 + vptestmb k2{k2}, m7, m7 ; flat8in + por m2, m10, [maskq+0]{bcstd} + vptestmd k5, m2, pbmask + vpmovm2d m7, k5 + vptestmb k3{k3}, m7, m7 + kandnq k3, k2, k3 ; fm & !flat8 & !flat16 + kandnq k2, k4, k2 ; flat8 & !flat16 +%elif %1 != 4 + vpbroadcastd m0, [maskq+4] + vptestmd k4, m0, pbmask + vpmovm2d m7, k4 + vptestmb k2{k2}, m7, m7 + kandq k2, k2, k3 ; flat8 & fm + por m0, [maskq+0]{bcstd} + vptestmd k4, m0, pbmask + vpmovm2d m7, k4 + vptestmb k3{k3}, m7, m7 + kandnq k3, k2, k3 ; fm & !flat8 +%else + %ifidn %2, v + vptestmd k4, pbmask, [maskq+0]{bcstd} + %else + vpbroadcastd m0, [maskq+0] + vptestmd k4, m0, pbmask + %endif + vpmovm2d m7, k4 + vptestmb k3{k3}, m7, m7 ; fm +%endif + + ; short filter +%if %1 >= 8 + SWAP m23, m15 +%endif + vpbroadcastd m15, [pb_3] + vpbroadcastd m0, [pb_4] + vpbroadcastd m12, [pb_16] + vpbroadcastd m1, [pb_64] + pxor m3, pb128 + pxor m6, pb128 + psubsb m10{k1}{z}, m3, m6 ; f=iclip_diff(p1-q1)&hev + pxor m4, pb128 + pxor m5, pb128 + psubsb m11, m5, m4 + paddsb m10, m11 + paddsb m10, m11 + paddsb m10{k3}{z}, m10, m11 ; f=iclip_diff(3*(q0-p0)+f)&fm + paddsb m8, m10, m15 + paddsb m10, m0 + pand m8, [pb_248]{bcstd} + pand m10, [pb_248]{bcstd} + psrlq m8, 3 + psrlq m10, 3 + pxor m8, m12 + pxor m10, m12 + psubb m8, m12 ; f2 + psubb m10, m12 ; f1 + paddsb m4, m8 + psubsb m5, m10 + pxor m4, pb128 + pxor m5, pb128 + ; + pxor m10, pb128 + pxor m8, m8 + pavgb m8, m10 ; f=(f1+1)>>1 + psubb m8, m1 + knotq k1, k1 + paddsb m3{k1}, m3, m8 + psubsb m6{k1}, m6, m8 + pxor m3, pb128 + pxor m6, pb128 + +%if %1 == 16 + ; flat16 filter +%ifidn %2, v + lea t0, [dstq+mstrideq*8] +%endif + SWAP m24, m16, m14 + SWAP m2, m17, m22 + SWAP m7, m18 + + ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A + ; write -6 + vpbroadcastd m1, [pb_7_1] + vpbroadcastd m12, [pb_2] + punpcklbw m14, m24, m25 + punpckhbw m22, m24, m25 + pmaddubsw m10, m14, m1 + pmaddubsw m11, m22, m1 ; p6*7+p3 + punpcklbw m8, m2, m7 + punpckhbw m9, m2, m7 + pmaddubsw m8, m12 + pmaddubsw m9, m12 + paddw m10, m8 + paddw m11, m9 ; p6*7+p5*2+p4*2+p3 +%ifidn %2, h + vpbroadcastd m27, [pw_2048] + vpbroadcastd m1, [pb_m1_1] + %define pw2048 m27 + %define pbm1_1 m1 +%endif + punpcklbw m8, m13, m3 + punpckhbw m9, m13, m3 + pmaddubsw m8, m23 + pmaddubsw m9, m23 + paddw m10, m8 + paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1 + punpcklbw m8, m4, m5 + punpckhbw m9, m4, m5 + pmaddubsw m8, m23 + pmaddubsw m9, m23 + paddw m10, m8 + paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 + pmulhrsw m8, m10, pw2048 + pmulhrsw m9, m11, pw2048 + packuswb m8, m9 +%ifidn %2, v + vmovdqu8 [t0+strideq*2]{k4}, m8 ; p5 +%else + vpblendmb m8{k4}, m2, m8 + mova [rsp+1*64], m8 +%endif + + ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B + ; write -5 + pmaddubsw m14, pbm1_1 + pmaddubsw m22, pbm1_1 + paddw m10, m14 + paddw m11, m22 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0 + punpcklbw m8, m24, m6 + punpckhbw m9, m24, m6 + pmaddubsw m8, pbm1_1 + pmaddubsw m9, pbm1_1 + paddw m10, m8 + paddw m11, m9 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1 + SWAP m18, m8 + SWAP m23, m9 + pmulhrsw m8, m10, pw2048 + pmulhrsw m9, m11, pw2048 + packuswb m8, m9 +%ifidn %2, v + vmovdqu8 [t0+stride3q]{k4}, m8 ; p4 +%else + vpblendmb m8{k4}, m7, m8 + mova [rsp+2*64], m8 +%endif + + ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C + ; write -4 + SWAP m14, m16 + punpcklbw m8, m24, m13 + punpckhbw m9, m24, m13 + pmaddubsw m8, pbm1_1 + pmaddubsw m9, pbm1_1 + paddw m10, m8 + paddw 
m11, m9 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1 + punpcklbw m8, m2, m14 + punpckhbw m2, m14 + pmaddubsw m8, pbm1_1 + pmaddubsw m2, pbm1_1 + paddw m10, m8 + paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2 + SWAP m16, m8 + pmulhrsw m8, m10, pw2048 + pmulhrsw m9, m11, pw2048 + packuswb m8, m9 +%ifidn %2, v + vmovdqu8 [t0+strideq*4]{k4}, m8 ; p3 +%else + vpblendmb m8{k4}, m25, m8 + mova [rsp+3*64], m8 +%endif + + ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D + ; write -3 + SWAP m22, m17 + punpcklbw m8, m24, m3 + punpckhbw m9, m24, m3 + pmaddubsw m8, pbm1_1 + pmaddubsw m9, pbm1_1 + paddw m10, m8 + paddw m11, m9 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2 + punpcklbw m8, m7, m22 + punpckhbw m7, m22 + pmaddubsw m8, pbm1_1 + pmaddubsw m7, pbm1_1 + paddw m10, m8 + paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3 + SWAP m17, m8 + pmulhrsw m8, m10, pw2048 + pmulhrsw m9, m11, pw2048 + packuswb m8, m9 + vpblendmb m15{k4}, m13, m8 ; don't clobber p2/m13 since we need it in F + + ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E + ; write -2 +%ifidn %2, v + lea t0, [dstq+strideq*4] +%endif + punpcklbw m8, m24, m4 + punpckhbw m9, m24, m4 + pmaddubsw m8, pbm1_1 + pmaddubsw m9, pbm1_1 + paddw m10, m8 + paddw m11, m9 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3 + punpcklbw m8, m25, m29 + punpckhbw m9, m25, m29 + SWAP m26, m29 + pmaddubsw m8, pbm1_1 + pmaddubsw m9, pbm1_1 + paddw m10, m8 + paddw m11, m9 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4 + SWAP m29, m8 + SWAP m0, m9 + pmulhrsw m8, m10, pw2048 + pmulhrsw m9, m11, pw2048 + packuswb m8, m9 + vpblendmb m12{k4}, m3, m8 ; don't clobber p1/m3 since we need it in G + + ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F + ; write -1 +%ifidn %2, h + SWAP m28, m24 + punpcklbw m8, m28, m5 + punpckhbw m24, m28, m5 +%else + punpcklbw m8, m24, m5 + punpckhbw m24, m5 +%endif + pmaddubsw m8, pbm1_1 + pmaddubsw m24, pbm1_1 + paddw m10, m8 + paddw m11, m24 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4 + punpcklbw m24, m13, m30 + punpckhbw m9, m13, m30 +%ifidn %2, h + SWAP m27, m30 +%endif + SWAP m13, m15 + pmaddubsw m24, pbm1_1 + pmaddubsw m9, pbm1_1 + paddw m10, m24 + paddw m11, m9 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5 + SWAP m30, m24 + SWAP m15, m9 +%ifidn %2, h + SWAP m9, m24 + %define pw2048 m9 +%endif + pmulhrsw m24, m10, pw2048 + pmulhrsw m8, m11, pw2048 + paddw m10, m18 ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5 + paddw m11, m23 + packuswb m24, m8 + punpcklbw m8, m3, m31 + pmaddubsw m8, pbm1_1 + paddw m10, m8 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6 + SWAP m18, m8 + pmulhrsw m8, m10, pw2048 + paddw m10, m16 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6 +%ifidn %2, h + SWAP m16, m9 + %define pw2048 m16 +%endif + punpckhbw m9, m3, m31 + SWAP m3, m12 + pmaddubsw m9, pbm1_1 + paddw m11, m9 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6 + SWAP m23, m9 + pmulhrsw m9, m11, pw2048 + paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6 +%ifidn %2, h + SWAP m2, m1 + %define pbm1_1 m2 +%endif + vpblendmb m1{k4}, m4, m24 ; don't clobber p0/m4 since we need it in H + + ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G + ; write +0 + SWAP m24, m31 ; q6 + packuswb m8, m9 +%ifidn %2, h + SWAP m31, m2 + %define pbm1_1 m31 +%endif + vpblendmb m12{k4}, m5, m8 ; don't clobber q0/m5 since we need it in I + + ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H + ; write +1 + punpcklbw m8, m4, m24 + punpckhbw m2, m4, m24 + SWAP m4, m1 + pmaddubsw m8, pbm1_1 + pmaddubsw m2, pbm1_1 + paddw m10, m8 + paddw m11, m2 ; 
p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2 + pmulhrsw m2, m10, pw2048 + pmulhrsw m9, m11, pw2048 + packuswb m2, m9 + vpblendmb m2{k4}, m6, m2 ; don't clobber q1/m6 since we need it in K + + ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I + ; write +2 + paddw m10, m17 ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2 + paddw m11, m7 + punpcklbw m8, m5, m24 + punpckhbw m9, m5, m24 + SWAP m5, m12 + pmaddubsw m8, pbm1_1 + pmaddubsw m9, pbm1_1 + paddw m10, m8 + paddw m11, m9 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3 + pmulhrsw m7, m10, pw2048 + pmulhrsw m9, m11, pw2048 + packuswb m7, m9 + vpblendmb m7{k4}, m14, m7 ; don't clobber q2/m14 since we need it in K + + ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J + ; write +3 + paddw m10, m29 ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3 + paddw m11, m0 + punpcklbw m8, m6, m24 + punpckhbw m9, m6, m24 + SWAP 2, 6 + pmaddubsw m8, pbm1_1 + pmaddubsw m9, pbm1_1 + paddw m10, m8 + paddw m11, m9 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4 + pmulhrsw m8, m10, pw2048 + pmulhrsw m9, m11, pw2048 + packuswb m8, m9 +%ifidn %2, v + vmovdqu8 [t0+mstrideq]{k4}, m8 +%else + SWAP m29, m16 + %define pw2048 m29 + vpblendmb m16{k4}, m22, m8 +%endif + + ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K + ; write +4 + paddw m10, m30 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 + paddw m11, m15 +%ifidn %2, h + SWAP m15, m8 +%endif + punpcklbw m8, m14, m24 + punpckhbw m9, m14, m24 + SWAP 14, 7 + pmaddubsw m8, pbm1_1 + pmaddubsw m9, pbm1_1 + paddw m10, m8 + paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 + pmulhrsw m8, m10, pw2048 + pmulhrsw m9, m11, pw2048 + packuswb m8, m9 +%ifidn %2, v + vmovdqu8 [t0+strideq*0]{k4}, m8 ; q4 +%else + vpblendmb m17{k4}, m26, m8 +%endif + + ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L + ; write +5 + paddw m10, m18 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 + paddw m11, m23 + punpcklbw m8, m22, m24 + punpckhbw m9, m22, m24 + SWAP m30, m24 + pmaddubsw m8, pbm1_1 + pmaddubsw m9, pbm1_1 + paddw m10, m8 + paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 + pmulhrsw m10, pw2048 + pmulhrsw m11, pw2048 + packuswb m10, m11 +%ifidn %2, v + vmovdqu8 [t0+strideq*1]{k4}, m10 ; q5 +%else + vmovdqu8 m27{k4}, m10 +%endif + +%ifidn %2, v + lea t0, [dstq+mstrideq*4] +%endif +%endif + +%if %1 >= 8 + ; flat8 filter + vpbroadcastd m9, [pb_3_1] + vpbroadcastd m10, [pb_2_1] +%if %1 == 16 + vpbroadcastd m23, [pb_1] + vpbroadcastd m0, [pb_4] +%elifidn %2, h + vpbroadcastd m31, [pb_m1_1] + %define pbm1_1 m31 +%endif + punpcklbw m24, m25, m3 + punpckhbw m26, m25, m3 + pmaddubsw m2, m24, m9 + pmaddubsw m7, m26, m9 ; 3 * p3 + p1 + punpcklbw m8, m13, m4 + punpckhbw m11, m13, m4 + pmaddubsw m8, m10 + pmaddubsw m11, m10 + paddw m2, m8 + paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + punpcklbw m8, m5, m0 + punpckhbw m11, m5, m0 + pmaddubsw m8, m23 + pmaddubsw m11, m23 + paddw m2, m8 + paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4 + psrlw m8, m2, 3 + psrlw m11, m7, 3 + packuswb m8, m11 +%if is_h || %1 == 16 + vpblendmb m10{k2}, m13, m8 ; p2 +%endif +%ifidn %2, v + %if %1 == 8 + vmovdqu8 [t0+strideq*1]{k2}, m8 + %else + mova [t0+strideq*1], m10 + %endif +%endif + + pmaddubsw m8, m24, pbm1_1 + pmaddubsw m11, m26, pbm1_1 + paddw m2, m8 + paddw m7, m11 + punpcklbw m8, m13, m6 + punpckhbw m11, m13, m6 + pmaddubsw m8, pbm1_1 + pmaddubsw m11, pbm1_1 + paddw m2, m8 + paddw m7, m11 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4 + psrlw m8, m2, 3 + psrlw m11, m7, 3 + packuswb m8, m11 + vpblendmb m8{k2}, m3, m8 ; p1 +%ifidn %2, v + mova [t0+strideq*2], m8 +%else + SWAP 
m18, m8 +%endif + + pmaddubsw m24, m23 + pmaddubsw m26, m23 + psubw m2, m24 + psubw m7, m26 + punpcklbw m8, m4, m14 + punpckhbw m11, m4, m14 + pmaddubsw m8, m23 + pmaddubsw m11, m23 + paddw m2, m8 + paddw m7, m11 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4 + psrlw m8, m2, 3 + psrlw m11, m7, 3 + packuswb m8, m11 + vpblendmb m8{k2}, m4, m8 ; p0 +%ifidn %2, v + mova [t0+stride3q], m8 +%else + SWAP m29, m8 +%endif + + punpcklbw m24, m5, m22 + punpckhbw m26, m5, m22 + pmaddubsw m8, m24, m23 + pmaddubsw m11, m26, m23 + paddw m2, m8 + paddw m7, m11 + punpcklbw m8, m4, m25 + punpckhbw m11, m4, m25 + pmaddubsw m8, m23 + pmaddubsw m11, m23 + psubw m2, m8 + psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4 + psrlw m8, m2, 3 + psrlw m11, m7, 3 + packuswb m8, m11 + vpblendmb m11{k2}, m5, m8 ; q0 +%ifidn %2, v + mova [dstq+strideq*0], m11 +%endif + + pmaddubsw m24, pbm1_1 + pmaddubsw m26, pbm1_1 + paddw m2, m24 + paddw m7, m26 + punpcklbw m8, m13, m6 + punpckhbw m13, m6 + pmaddubsw m8, pbm1_1 + pmaddubsw m13, pbm1_1 + paddw m2, m8 + paddw m7, m13 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4 + psrlw m8, m2, 3 + psrlw m13, m7, 3 + packuswb m8, m13 + vpblendmb m13{k2}, m6, m8 ; q1 +%ifidn %2, v + mova [dstq+strideq*1], m13 +%endif + + punpcklbw m24, m3, m6 + punpckhbw m26, m3, m6 + pmaddubsw m24, m23 + pmaddubsw m26, m23 + psubw m2, m24 + psubw m7, m26 + punpcklbw m24, m14, m22 + punpckhbw m26, m14, m22 + pmaddubsw m24, m23 + pmaddubsw m26, m23 + paddw m2, m24 + paddw m7, m26 ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4 + psrlw m2, 3 + psrlw m7, 3 + packuswb m2, m7 +%if is_h || %1 == 16 + vpblendmb m2{k2}, m14, m2 ; q2 +%endif +%ifidn %2, v + %if %1 == 8 + vmovdqu8 [dstq+strideq*2]{k2}, m2 + %else + mova [dstq+strideq*2], m2 + %endif +%endif + +%ifidn %2, h + SWAP m24, m18 + SWAP m26, m29 +%if %1 == 8 + ; 16x8 transpose + punpcklbw m3, m25, m10 + punpckhbw m25, m10 + punpcklbw m10, m24, m26 + punpckhbw m24, m26 + punpcklbw m26, m11, m13 + punpckhbw m11, m13 + punpcklbw m13, m2, m22 + punpckhbw m2, m22 + ; + punpcklwd m22, m3, m10 + punpckhwd m3, m10 + punpcklwd m10, m25, m24 + punpckhwd m25, m24 + punpcklwd m24, m26, m13 + punpckhwd m26, m13 + punpcklwd m13, m11, m2 + punpckhwd m11, m2 + ; + punpckldq m2, m22, m24 + punpckhdq m22, m24 + punpckldq m24, m3, m26 + punpckhdq m3, m26 + punpckldq m26, m10, m13 + punpckhdq m10, m13 + punpckldq m13, m25, m11 + punpckhdq m25, m11 + ; write 8x32 + vpbroadcastd ym16, strided + pmulld ym16, [hmulD] + lea t1, [dstq+strideq*2] + lea t2, [dstq+strideq*4] + lea t3, [t1 +strideq*4] + lea t0, [dstq+strideq*8] + kmovb k1, k6 + kmovb k2, k6 + kmovb k3, k6 + kmovb k4, k6 + vpscatterdq [dstq+ym16-4]{k1}, m2 + vpscatterdq [t1 +ym16-4]{k2}, m22 + vpscatterdq [t2 +ym16-4]{k3}, m24 + vpscatterdq [t3 +ym16-4]{k4}, m3 + lea t1, [t0+strideq*2] + lea t2, [t0+strideq*4] + lea t3, [t1+strideq*4] + kmovb k1, k6 + kmovb k2, k6 + kmovb k3, k6 + kmovb k4, k6 + vpscatterdq [t0+ym16-4]{k1}, m26 + vpscatterdq [t1+ym16-4]{k2}, m10 + vpscatterdq [t2+ym16-4]{k3}, m13 + vpscatterdq [t3+ym16-4]{k4}, m25 +%else + ; 16x16 transpose and store + SWAP 5, 10, 2 + SWAP 6, 24 + SWAP 7, 26 + SWAP 8, 11 + SWAP 9, 13 + mova m24, [rsp+0*64] + SWAP m26, m28 + mova m2, [rsp+1*64] + mova m3, [rsp+2*64] + mova m4, [rsp+3*64] + SWAP m11, m16 + SWAP m25, m17 + SWAP m13, m27 + SWAP m14, m30 + TRANSPOSE_16X16B 1, 0, [rsp+4*64] + movu [dstq+strideq*0-8], xm24 + movu [dstq+strideq*1-8], xm26 + movu [dstq+strideq*2-8], xm2 + movu [dstq+stride3q -8], xm3 + lea t0, [dstq+strideq*4] + movu [t0+strideq*0-8], xm4 + movu 
[t0+strideq*1-8], xm5 + movu [t0+strideq*2-8], xm6 + movu [t0+stride3q -8], xm7 + lea t0, [t0+strideq*4] + movu [t0+strideq*0-8], xm8 + movu [t0+strideq*1-8], xm9 + movu [t0+strideq*2-8], xm10 + movu [t0+stride3q -8], xm11 + lea t0, [t0+strideq*4] + movu [t0+strideq*0-8], xm25 + movu [t0+strideq*1-8], xm13 + movu [t0+strideq*2-8], xm14 + movu [t0+stride3q -8], xm22 + lea t0, [t0+strideq*4] + vextracti128 [t0+strideq*0-8], ym24, 1 + vextracti128 [t0+strideq*1-8], ym26, 1 + vextracti128 [t0+strideq*2-8], ym2, 1 + vextracti128 [t0+stride3q -8], ym3, 1 + lea t0, [t0+strideq*4] + vextracti128 [t0+strideq*0-8], ym4, 1 + vextracti128 [t0+strideq*1-8], ym5, 1 + vextracti128 [t0+strideq*2-8], ym6, 1 + vextracti128 [t0+stride3q -8], ym7, 1 + lea t0, [t0+strideq*4] + vextracti128 [t0+strideq*0-8], ym8, 1 + vextracti128 [t0+strideq*1-8], ym9, 1 + vextracti128 [t0+strideq*2-8], ym10, 1 + vextracti128 [t0+stride3q -8], ym11, 1 + lea t0, [t0+strideq*4] + vextracti128 [t0+strideq*0-8], ym25, 1 + vextracti128 [t0+strideq*1-8], ym13, 1 + vextracti128 [t0+strideq*2-8], ym14, 1 + vextracti128 [t0+stride3q -8], ym22, 1 + lea t0, [t0+strideq*4] + vextracti32x4 [t0+strideq*0-8], m24, 2 + vextracti32x4 [t0+strideq*1-8], m26, 2 + vextracti32x4 [t0+strideq*2-8], m2, 2 + vextracti32x4 [t0+stride3q -8], m3, 2 + lea t0, [t0+strideq*4] + vextracti32x4 [t0+strideq*0-8], m4, 2 + vextracti32x4 [t0+strideq*1-8], m5, 2 + vextracti32x4 [t0+strideq*2-8], m6, 2 + vextracti32x4 [t0+stride3q -8], m7, 2 + lea t0, [t0+strideq*4] + vextracti32x4 [t0+strideq*0-8], m8, 2 + vextracti32x4 [t0+strideq*1-8], m9, 2 + vextracti32x4 [t0+strideq*2-8], m10, 2 + vextracti32x4 [t0+stride3q -8], m11, 2 + lea t0, [t0+strideq*4] + vextracti32x4 [t0+strideq*0-8], m25, 2 + vextracti32x4 [t0+strideq*1-8], m13, 2 + vextracti32x4 [t0+strideq*2-8], m14, 2 + vextracti32x4 [t0+stride3q -8], m22, 2 + lea t0, [t0+strideq*4] + vextracti32x4 [t0+strideq*0-8], m24, 3 + vextracti32x4 [t0+strideq*1-8], m26, 3 + vextracti32x4 [t0+strideq*2-8], m2, 3 + vextracti32x4 [t0+stride3q -8], m3, 3 + lea t0, [t0+strideq*4] + vextracti32x4 [t0+strideq*0-8], m4, 3 + vextracti32x4 [t0+strideq*1-8], m5, 3 + vextracti32x4 [t0+strideq*2-8], m6, 3 + vextracti32x4 [t0+stride3q -8], m7, 3 + lea t0, [t0+strideq*4] + vextracti32x4 [t0+strideq*0-8], m8, 3 + vextracti32x4 [t0+strideq*1-8], m9, 3 + vextracti32x4 [t0+strideq*2-8], m10, 3 + vextracti32x4 [t0+stride3q -8], m11, 3 + lea t0, [t0+strideq*4] + vextracti32x4 [t0+strideq*0-8], m25, 3 + vextracti32x4 [t0+strideq*1-8], m13, 3 + vextracti32x4 [t0+strideq*2-8], m14, 3 + vextracti32x4 [t0+stride3q -8], m22, 3 +%endif +%endif + +%elif %1 == 6 + ; flat6 filter + vpbroadcastd m15, [pb_3_1] + vpbroadcastd m12, [pb_2] + punpcklbw m8, m13, m5 + punpckhbw m11, m13, m5 + pmaddubsw m0, m8, m15 + pmaddubsw m1, m11, m15 + punpcklbw m7, m4, m3 + punpckhbw m10, m4, m3 + pmaddubsw m2, m7, m12 + pmaddubsw m12, m10, m12 +%ifidn %2, h + vpbroadcastd m15, [pb_m1_1] + %define pbm1_1 m15 +%endif + paddw m0, m2 + paddw m1, m12 + pmulhrsw m2, m0, m16 + pmulhrsw m12, m1, m16 + packuswb m2, m12 + vpblendmb m2{k2}, m3, m2 ; p1 +%ifidn %2, v + mova [t0+strideq*2], m2 +%endif + + pmaddubsw m8, pbm1_1 + pmaddubsw m11, pbm1_1 + paddw m0, m8 + paddw m1, m11 + punpcklbw m8, m13, m6 + punpckhbw m11, m13, m6 + pmaddubsw m8, pbm1_1 + pmaddubsw m11, pbm1_1 + paddw m0, m8 + paddw m1, m11 + pmulhrsw m12, m0, m16 + pmulhrsw m13, m1, m16 + packuswb m12, m13 + vpblendmb m12{k2}, m4, m12 ; p0 +%ifidn %2, v + mova [t0+stride3q], m12 +%endif + + vpbroadcastd m9, [pb_m1_2] + 
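+ ; Output taps of this 6-wide flat (chroma) filter, written out as the
+ ; rounded 8-weight sums the surrounding code accumulates; pmulhrsw against
+ ; pw_4096 realizes the rounded shift, (x + 4) >> 3, on each sum:
+ ;   p1' = (3*p2 + 2*p1 + 2*p0 +   q0               + 4) >> 3
+ ;   p0' = (  p2 + 2*p1 + 2*p0 + 2*q0 +   q1        + 4) >> 3
+ ;   q0' = (         p1 + 2*p0 + 2*q0 + 2*q1 +   q2 + 4) >> 3
+ ;   q1' = (                p0 + 2*q0 + 2*q1 + 3*q2 + 4) >> 3
+ ; Each output is blended with the unfiltered pixel under k2 (flat & fm), so
+ ; only positions that passed the flatness test are overwritten.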
vpbroadcastd m4, [pb_m1_0] + paddw m0, m8 + paddw m1, m11 + punpcklbw m8, m3, m14 + punpckhbw m11, m3, m14 + pmaddubsw m14, m8, pbm1_1 + pmaddubsw m13, m11, pbm1_1 + paddw m0, m14 + paddw m1, m13 + pmulhrsw m14, m0, m16 + pmulhrsw m13, m1, m16 + packuswb m14, m13 + vpblendmb m14{k2}, m5, m14 ; q0 +%ifidn %2, v + mova [dstq+strideq*0], m14 +%endif + + pmaddubsw m8, m9 + pmaddubsw m11, m9 + paddw m0, m8 + paddw m1, m11 + pmaddubsw m7, m4 + pmaddubsw m10, m4 + paddw m0, m7 + paddw m1, m10 + pmulhrsw m0, m16 + pmulhrsw m1, m16 + packuswb m0, m1 + vpblendmb m0{k2}, m6, m0 ; q1 +%ifidn %2, v + mova [dstq+strideq*1], m0 +%else + TRANSPOSE_16x4_AND_WRITE_4x32 2, 12, 14, 0, 1 +%endif +%else ; %1 == 4 +%ifidn %2, v + mova [t0+strideq*0], m3 ; p1 + mova [t0+strideq*1], m4 ; p0 + mova [t0+strideq*2], m5 ; q0 + mova [t0+stride3q ], m6 ; q1 +%else + TRANSPOSE_16x4_AND_WRITE_4x32 3, 4, 5, 6, 7 +%endif +%endif +%endmacro + +%define k7 k6 + +INIT_ZMM avx512icl +cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \ + lut, w, stride3, mstride + DECLARE_REG_TMP 9 + shl l_strideq, 2 + sub lq, l_strideq + mov mstrideq, strideq + neg mstrideq + lea stride3q, [strideq*3] + mova m21, [pb_4x0_4x4_4x8_4x12] + mova m20, [pb_mask] + vpbroadcastd m19, [pb_128] + vpbroadcastd m28, [pb_m1_1] + vpbroadcastd m27, [pw_2048] + %define pbshuf m21 + %define pbmask m20 + %define pb128 m19 + %define pbm1_1 m28 + %define pw2048 m27 + +.loop: + cmp word [maskq+8], 0 ; vmask[2] + je .no_flat16 + + FILTER 16, v + jmp .end + +.no_flat16: + cmp word [maskq+4], 0 ; vmask[1] + je .no_flat + + FILTER 8, v + jmp .end + +.no_flat: + cmp word [maskq+0], 0 ; vmask[0] + je .end + + call .v4 + +.end: + add lq, 64 + add dstq, 64 + add maskq, 2 + sub wd, 16 + jg .loop + RET +ALIGN function_align +RESET_MM_PERMUTATION +.v4: + FILTER 4, v + ret + +cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \ + lut, h, stride3, stride8 + DECLARE_REG_TMP 9, 10, 11, 12 + shl l_strideq, 2 + sub lq, 4 + lea stride3q, [strideq*3] + lea stride8q, [strideq*8] + kxnorw k6, k6, k6 + vpbroadcastd m19, strided + vpbroadcastd m20, l_strided + pmulld m21, m19, [hmulA] + pmulld m20, [hmulB] + pmulld m19, [hmulC] + %define pbshuf [pb_4x0_4x4_4x8_4x12] + %define pbmask [pb_mask] + %define pb128 [pb_128]{bcstd} + shl l_strideq, 1 + +.loop: + cmp word [maskq+8], 0 ; vmask[2] + je .no_flat16 + + FILTER 16, h + jmp .end + +.no_flat16: + cmp word [maskq+4], 0 ; vmask[1] + je .no_flat + + FILTER 8, h + jmp .end + +.no_flat: + cmp word [maskq+0], 0 ; vmask[0] + je .end + + call .h4 + +.end: + lea lq, [lq+l_strideq*8] + lea dstq, [dstq+stride8q*8] + add maskq, 2 + sub hd, 16 + jg .loop + RET +ALIGN function_align +RESET_MM_PERMUTATION +.h4: + FILTER 4, h + ret + +cglobal lpf_v_sb_uv_8bpc, 7, 10, 22, dst, stride, mask, l, l_stride, \ + lut, w, stride3, mstride + DECLARE_REG_TMP 9 + shl l_strideq, 2 + sub lq, l_strideq + mov mstrideq, strideq + neg mstrideq + lea stride3q, [strideq*3] + mova m21, [pb_4x0_4x4_4x8_4x12] + mova m20, [pb_mask] + vpbroadcastd m19, [pb_128] + vpbroadcastd m17, [pb_m1_1] + vpbroadcastd m16, [pw_4096] + %define pbshuf m21 + %define pbmask m20 + %define pb128 m19 + %define pbm1_1 m17 + +.loop: + cmp word [maskq+4], 0 ; vmask[1] + je .no_flat + + FILTER 6, v + jmp .end + +.no_flat: + cmp word [maskq+0], 0 ; vmask[0] + je .end + + call mangle(private_prefix %+ _lpf_v_sb_y_8bpc_avx512icl).v4 + +.end: + add lq, 64 + add dstq, 64 + add maskq, 2 + sub wd, 16 + jg .loop + RET + +%undef k7 +cglobal lpf_h_sb_uv_8bpc, 7, 12, 22, 
dst, stride, mask, l, l_stride, \
+ lut, h, stride3, stride8
+ DECLARE_REG_TMP 9, 10, 11
+ mov r7d, 0xffff
+ movzx r8d, r7b
+ cmp hd, 9
+ cmovb r7d, r8d
+ kmovw k6, r7d ; h > 8 ? 0xffff : 0x00ff
+ shl l_strideq, 2
+ sub lq, 4
+ kshiftrw k7, k6, 4 ; h > 8 ? 0xfff : 0xf
+ lea stride3q, [strideq*3]
+ lea stride8q, [strideq*8]
+ vpbroadcastd m19, strided
+ vpbroadcastd m20, l_strided
+ pmulld m21, m19, [hmulA]
+ pmulld m20, [hmulB]
+ pmulld m19, [hmulC]
+ mova m18, [pb_mask]
+ vpbroadcastd m17, [pb_128]
+ vpbroadcastd m16, [pw_4096]
+ %define pbshuf [pb_4x0_4x4_4x8_4x12]
+ %define pbmask m18
+ %define pb128 m17
+ add l_strideq, l_strideq
+
+.loop:
+ cmp word [maskq+4], 0 ; vmask[1]
+ je .no_flat
+
+ FILTER 6, h
+ jmp .end
+
+.no_flat:
+ cmp word [maskq+0], 0 ; vmask[0]
+ je .end
+
+ call mangle(private_prefix %+ _lpf_h_sb_y_8bpc_avx512icl).h4
+
+.end:
+ lea lq, [lq+l_strideq*8]
+ lea dstq, [dstq+stride8q*8]
+ add maskq, 2
+ sub hd, 16
+ jg .loop
+ RET
+
+%endif ; ARCH_X86_64
diff --git a/third_party/dav1d/src/x86/loopfilter_sse.asm b/third_party/dav1d/src/x86/loopfilter_sse.asm
new file mode 100644
index 0000000000..cd0eb54702
--- /dev/null
+++ b/third_party/dav1d/src/x86/loopfilter_sse.asm
@@ -0,0 +1,2348 @@
+; Copyright © 2018-2021, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
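+
+; A reference sketch of what the FILTER macro below computes: the wd=8
+; "flat8" outputs are running box sums updated two taps at a time, e.g.
+;   p1' = (2*p3 + p2 + 2*p1 + p0 + q0 + q1 + 4) >> 3
+;   p0' = (p3 + p2 + p1 + 2*p0 + q0 + q1 + q2 + 4) >> 3
+;   q0' = (p2 + p1 + p0 + 2*q0 + q1 + q2 + q3 + 4) >> 3
+; matching the per-step sum comments in the macro body; the wd=16 "flat16"
+; path extends the same incremental scheme to a 16-tap sum (steps A-L),
+; while wd=4 and wd=6 use the short and flat6 filters.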
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA 16 + +pb_4x0_4x4_4x8_4x12: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 +pb_7_1: times 8 db 7, 1 +pb_3_1: times 8 db 3, 1 +pb_2_1: times 8 db 2, 1 +pb_m1_0: times 8 db -1, 0 +pb_m1_1: times 8 db -1, 1 +pb_m1_2: times 8 db -1, 2 +pb_1: times 16 db 1 +pb_2: times 16 db 2 +pb_3: times 16 db 3 +pb_4: times 16 db 4 +pb_16: times 16 db 16 +pb_63: times 16 db 63 +pb_64: times 16 db 64 +pb_128: times 16 db 0x80 +pb_129: times 16 db 0x81 +pb_240: times 16 db 0xf0 +pb_248: times 16 db 0xf8 +pb_254: times 16 db 0xfe + +pw_2048: times 8 dw 2048 +pw_4096: times 8 dw 4096 + +pd_mask: dd 1, 2, 4, 8 + +SECTION .text + +%macro ABSSUB 4 ; dst, a, b, tmp + psubusb %1, %2, %3 + psubusb %4, %3, %2 + por %1, %4 +%endmacro + +%macro TRANSPOSE_16x4_AND_WRITE_4x16 5 + ; transpose 16x4 + punpcklbw m%5, m%1, m%2 + punpckhbw m%1, m%2 + punpcklbw m%2, m%3, m%4 + punpckhbw m%3, m%4 + punpcklwd m%4, m%5, m%2 + punpckhwd m%5, m%2 + punpcklwd m%2, m%1, m%3 + punpckhwd m%1, m%3 + + ; write out +%assign %%n 0 +%rep 4 + movd [dstq+strideq *0-2], xm%4 + movd [dstq+strideq *4-2], xm%5 + movd [dstq+strideq *8-2], xm%2 + movd [dstq+stride3q*4-2], xm%1 + add dstq, strideq +%if %%n < 3 + psrldq xm%4, 4 + psrldq xm%5, 4 + psrldq xm%2, 4 + psrldq xm%1, 4 +%endif +%assign %%n (%%n+1) +%endrep + lea dstq, [dstq+stride3q*4] +%endmacro + +%macro TRANSPOSE_16X16B 2 ; output_transpose, mem +%if %1 == 0 + mova %2, m15 ; m7 in 32-bit +%endif + + ; input in m0-7 + punpcklbw m15, m0, m1 + punpckhbw m0, m1 + punpcklbw m1, m2, m3 + punpckhbw m2, m3 + punpcklbw m3, m4, m5 + punpckhbw m4, m5 +%if ARCH_X86_64 + SWAP 4, 5, 7 +%else + %if %1 == 0 + mova m5, %2 + %else + mova m5, [esp+1*16] + %endif + mova %2, m4 +%endif + punpcklbw m4, m6, m5 + punpckhbw m6, m5 + + ; interleaved in m15,0,1,2,3,7,4,6 + punpcklwd m5, m15, m1 + punpckhwd m15, m1 + punpcklwd m1, m0, m2 + punpckhwd m0, m2 + punpcklwd m2, m3, m4 + punpckhwd m3, m4 +%if ARCH_X86_64 + SWAP 3, 4, 7 +%else + mova m4, %2 + mova %2, m3 +%endif + punpcklwd m3, m4, m6 + punpckhwd m4, m6 + + ; interleaved in m5,15,1,0,2,7,3,4 + punpckldq m6, m5, m2 + punpckhdq m5, m2 +%if ARCH_X86_64 + SWAP 2, 7, 5 +%else + mova m2, %2 + mova [esp+1*16], m5 +%endif + punpckldq m5, m15, m2 + punpckhdq m15, m2 + punpckldq m2, m1, m3 + punpckhdq m1, m3 + punpckldq m3, m0, m4 + punpckhdq m0, m4 + +%if ARCH_X86_32 + mova [esp+0*16], m6 + mova [esp+2*16], m5 + mova [esp+3*16], m15 + mova [esp+4*16], m2 + mova [esp+5*16], m1 + mova [esp+6*16], m3 + mova [esp+7*16], m0 + mova m8, [esp+ 8*16] + mova m9, [esp+ 9*16] + mova m10, [esp+10*16] + %if %1 == 0 + mova m11, [esp+11*16] + mova m12, [esp+12*16] + mova m13, [esp+13*16] + mova m14, [esp+14*16] + %else + mova m11, [esp+20*16] + mova m12, [esp+15*16] + mova m13, [esp+16*16] + mova m14, [esp+17*16] + %endif +%endif + + ; input in m8-m15 +%if ARCH_X86_64 + SWAP 7, 4 +%endif + punpcklbw m7, m8, m9 + punpckhbw m8, m9 + punpcklbw m9, m10, m11 + punpckhbw m10, m11 + punpcklbw m11, m12, m13 + punpckhbw m12, m13 +%if ARCH_X86_64 + mova m13, %2 +%else + %if %1 == 0 + mova m13, [esp+15*16] + %else + mova m13, [esp+18*16] + %endif +%endif + mova %2, m12 + punpcklbw m12, m14, m13 + punpckhbw m14, m14, m13 + + ; interleaved in m7,8,9,10,11,rsp%2,12,14 + punpcklwd m13, m7, m9 + punpckhwd m7, m9 + punpcklwd m9, m8, m10 + punpckhwd m8, m10 + punpcklwd m10, m11, m12 + punpckhwd m11, m12 + mova m12, %2 + mova %2, m11 + punpcklwd m11, m12, m14 + punpckhwd m12, m14 + + ; interleaved in m13,7,9,8,10,rsp%2,11,12 
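+ ; (the interleave ladder doubles element width on each pass: bytes ->
+ ; words -> dwords -> qwords, so four passes complete the 16x16 transpose)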
+ punpckldq m14, m13, m10 + punpckhdq m13, m10 + punpckldq m10, m9, m11 + punpckhdq m9, m11 + punpckldq m11, m8, m12 + punpckhdq m8, m12 + mova m12, %2 + mova %2, m8 + punpckldq m8, m7, m12 + punpckhdq m7, m12 + +%if ARCH_X86_32 + mova [esp+ 8*16], m10 + mova [esp+ 9*16], m9 + mova [esp+10*16], m11 + SWAP 6, 1 + SWAP 4, 2 + SWAP 5, 3 + mova m6, [esp+0*16] + mova m4, [esp+1*16] + mova m5, [esp+2*16] +%endif + + ; interleaved in m6,7,5,15,2,1,3,0,14,13,10,9,11,rsp%2,8,7 + punpcklqdq m12, m6, m14 + punpckhqdq m6, m14 + punpcklqdq m14, m4, m13 + punpckhqdq m4, m13 + punpcklqdq m13, m5, m8 + punpckhqdq m5, m8 +%if ARCH_X86_64 + SWAP 8, 5 +%else + mova m8, [esp+3*16] + mova [esp+27*16], m5 + %define m15 m8 +%endif + punpcklqdq m5, m15, m7 + punpckhqdq m15, m7 + +%if ARCH_X86_32 + mova [esp+11*16], m12 + mova [esp+12*16], m6 + mova [esp+13*16], m14 + mova [esp+14*16], m4 + mova [esp+26*16], m13 + mova [esp+ 0*16], m5 + mova [esp+ 1*16], m15 + mova m2, [esp+ 4*16] + mova m10, [esp+ 8*16] + mova m1, [esp+ 5*16] + mova m9, [esp+ 9*16] + mova m3, [esp+ 6*16] + mova m11, [esp+10*16] + mova m0, [esp+ 7*16] +%endif + + punpcklqdq m7, m2, m10 + punpckhqdq m2, m10 + punpcklqdq m10, m1, m9 + punpckhqdq m1, m9 + punpcklqdq m9, m3, m11 + punpckhqdq m3, m11 + mova m11, %2 +%if ARCH_X86_32 + %define m12 m3 +%endif + mova %2, m12 + punpcklqdq m12, m0, m11 + punpckhqdq m0, m11 +%if %1 == 1 + mova m11, %2 +%endif + +%if ARCH_X86_64 + ; interleaved m11,6,14,4,13,8,5,15,7,2,10,1,9,3,12,0 + SWAP 0, 11, 1, 6, 5, 8, 7, 15 + SWAP 2, 14, 12, 9 + SWAP 3, 4, 13 +%else + %if %1 == 0 + mova [esp+15*16], m9 + mova [esp+17*16], m12 + mova [esp+18*16], m0 + mova [esp+28*16], m10 + mova [esp+29*16], m1 + mova m3, [esp+0*16] + mova m4, [esp+1*16] + SWAP m5, m7 + SWAP m6, m2 + %else + SWAP 0, 7 + SWAP 3, 1, 2, 4, 6 + %endif +%endif +%endmacro + +%macro FILTER 2 ; width [4/6/8/16], dir [h/v] +%if ARCH_X86_64 + %define %%flat8mem [rsp+0*16] + %define %%q2mem [rsp+1*16] + %define %%q3mem [rsp+2*16] +%else + %if %1 == 4 || %1 == 6 + %define %%p2mem [esp+ 8*16] + %define %%q2mem [esp+ 9*16] + %define %%flat8mem [esp+10*16] + %else + %ifidn %2, v + %define %%p2mem [esp+16*16] + %define %%q2mem [esp+ 1*16] + %define %%q3mem [esp+18*16] + %define %%flat8mem [esp+ 0*16] + %define %%flat16mem [esp+20*16] + %else + %define %%p2mem [esp+27*16] + %define %%q2mem [esp+28*16] + %define %%q3mem [esp+29*16] + %define %%flat8mem [esp+21*16] + %define %%flat16mem [esp+30*16] + %endif + %endif + %xdefine m12reg m12 +%endif + +%if ARCH_X86_32 + lea stride3q, [strideq*3] +%endif + ; load data +%ifidn %2, v +%if ARCH_X86_32 + mov mstrideq, strideq + neg mstrideq +%endif +%if %1 == 4 + lea tmpq, [dstq+mstrideq*2] + mova m3, [tmpq+strideq*0] ; p1 + mova m4, [tmpq+strideq*1] ; p0 + mova m5, [tmpq+strideq*2] ; q0 + mova m6, [tmpq+stride3q] ; q1 +%else + ; load 6-8 pixels, remainder (for wd=16) will be read inline + lea tmpq, [dstq+mstrideq*4] + ; we load p3 later +%define %%p3mem [dstq+mstrideq*4] + %if ARCH_X86_32 + %define m13 m0 + %define m14 m1 + %define m15 m2 + %endif + mova m13, [tmpq+strideq*1] + mova m3, [tmpq+strideq*2] + mova m4, [tmpq+stride3q] + mova m5, [dstq+strideq*0] + mova m6, [dstq+strideq*1] + mova m14, [dstq+strideq*2] +%if %1 != 6 + mova m15, [dstq+stride3q] +%endif + %if ARCH_X86_32 + mova %%p2mem, m13 + mova %%q2mem, m14 + %define m13 %%p2mem + %define m14 %%q2mem + %if %1 != 6 + mova %%q3mem, m15 + %define m15 %%q3mem + %endif + %endif +%endif +%else ; %2 == h + ; load lines +%if %1 == 4 + ; transpose 4x16 + movd m7, 
[dstq+strideq*0-2] + movd m3, [dstq+strideq*1-2] + movd m4, [dstq+strideq*2-2] + movd m5, [dstq+stride3q -2] + lea tmpq, [dstq+strideq*4] + punpcklbw m7, m3 + punpcklbw m4, m5 + movd m3, [tmpq+strideq*0-2] + movd m1, [tmpq+strideq*1-2] + movd m5, [tmpq+strideq*2-2] + movd m6, [tmpq+stride3q -2] + lea tmpq, [tmpq+strideq*4] + punpcklbw m3, m1 + punpcklbw m5, m6 + movd m0, [tmpq+strideq*0-2] + movd m1, [tmpq+strideq*1-2] + punpcklbw m0, m1 + movd m1, [tmpq+strideq*2-2] + movd m2, [tmpq+stride3q -2] + punpcklbw m1, m2 + punpcklqdq m7, m0 + punpcklqdq m4, m1 + lea tmpq, [tmpq+strideq*4] + movd m0, [tmpq+strideq*0-2] + movd m1, [tmpq+strideq*1-2] + punpcklbw m0, m1 + movd m1, [tmpq+strideq*2-2] + movd m2, [tmpq+stride3q -2] + punpcklbw m1, m2 + punpcklqdq m3, m0 + punpcklqdq m5, m1 + ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9 + ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13 + ; xm4: A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11 + ; xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15 + punpcklwd m6, m7, m4 + punpckhwd m7, m4 + punpcklwd m4, m3, m5 + punpckhwd m3, m5 + ; xm6: A0-3,B0-3,C0-3,D0-3 + ; xm7: A8-11,B8-11,C8-11,D8-11 + ; xm4: A4-7,B4-7,C4-7,D4-7 + ; xm3: A12-15,B12-15,C12-15,D12-15 + punpckldq m5, m6, m4 + punpckhdq m6, m4 + punpckldq m4, m7, m3 + punpckhdq m7, m3 + ; xm5: A0-7,B0-7 + ; xm6: C0-7,D0-7 + ; xm4: A8-15,B8-15 + ; xm7: C8-15,D8-15 + punpcklqdq m3, m5, m4 + punpckhqdq m5, m5, m4 + punpcklqdq m4, m6, m7 + punpckhqdq m6, m7 + ; xm3: A0-15 + ; xm5: B0-15 + ; xm4: C0-15 + ; xm6: D0-15 + SWAP 4, 5 +%elif %1 == 6 || %1 == 8 + ; transpose 8x16 + movq m7, [dstq+strideq*0-%1/2] + movq m3, [dstq+strideq*1-%1/2] + movq m4, [dstq+strideq*2-%1/2] + movq m5, [dstq+stride3q -%1/2] + lea tmpq, [dstq+strideq*8] + punpcklbw m7, m3 + punpcklbw m4, m5 + movq m3, [tmpq+strideq*0-%1/2] + movq m1, [tmpq+strideq*1-%1/2] + movq m5, [tmpq+strideq*2-%1/2] + movq m6, [tmpq+stride3q -%1/2] + lea tmpq, [dstq+strideq*4] + punpcklbw m3, m1 + punpcklbw m5, m6 + movq m6, [tmpq+strideq*0-%1/2] + movq m0, [tmpq+strideq*1-%1/2] + movq m1, [tmpq+strideq*2-%1/2] + movq m2, [tmpq+stride3q -%1/2] + lea tmpq, [tmpq+strideq*8] + punpcklbw m6, m0 + punpcklbw m1, m2 + movq m2, [tmpq+strideq*2-%1/2] + movq m0, [tmpq+stride3q -%1/2] + punpcklbw m2, m0 +%if ARCH_X86_64 + SWAP m15, m2 +%else + %define m15 [esp+3*16] + mova m15, m2 +%endif + movq m0, [tmpq+strideq*0-%1/2] + movq m2, [tmpq+strideq*1-%1/2] + punpcklbw m0, m2 + ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1 + ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9 + ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3 + ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11 + ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5 + ; xm0: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13 + ; xm1: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7 + ; xm2: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15 + punpcklwd m2, m7, m4 + punpckhwd m7, m4 + punpcklwd m4, m3, m5 + punpckhwd m3, m5 + punpcklwd m5, m6, m1 + punpckhwd m6, m1 + punpcklwd m1, m0, m15 + punpckhwd m0, m15 +%if ARCH_X86_64 + SWAP m15, m0 +%else + mova m15, m0 +%endif + ; xm2: A0-3,B0-3,C0-3,D0-3 + ; xm7: E0-3,F0-3,G0-3,H0-3 + ; xm4: A8-11,B8-11,C8-11,D8-11 + ; xm3: E8-11,F8-11,G8-11,H8-11 + ; xm5: A4-7,B4-7,C4-7,D4-7 + ; xm6: E4-7,F4-7,G4-7,H4-7 + ; xm1: A12-15,B12-15,C12-15,D12-15 + ; xm0: E12-15,F12-15,G12-15,H12-15 + punpckldq m0, m2, m5 + punpckhdq m2, m5 + punpckldq m5, m7, m6 +%if %1 != 6 + punpckhdq m7, m6 +%endif + punpckldq m6, m4, m1 + punpckhdq m4, m1 + punpckldq m1, m3, m15 +%if %1 
!= 6 + punpckhdq m3, m15 + %if ARCH_X86_64 + SWAP m15, m3 + %else + mova m15, m3 + %endif +%endif + ; xm0: A0-7,B0-7 + ; xm2: C0-7,D0-7 + ; xm5: E0-7,F0-7 + ; xm7: G0-7,H0-7 + ; xm6: A8-15,B8-15 + ; xm4: C8-15,D8-15 + ; xm1: E8-15,F8-15 + ; xm3: G8-15,H8-15 + punpcklqdq m3, m0, m6 + punpckhqdq m0, m6 + punpckhqdq m6, m2, m4 + punpcklqdq m2, m4 + punpcklqdq m4, m5, m1 + punpckhqdq m5, m1 +%if %1 == 8 + punpcklqdq m1, m7, m15 + punpckhqdq m7, m15 + ; xm3: A0-15 + ; xm0: B0-15 + ; xm2: C0-15 + ; xm6: D0-15 + ; xm4: E0-15 + ; xm5: F0-15 + ; xm1: G0-15 + ; xm7: H0-15 +%if ARCH_X86_64 + SWAP 11, 3, 2 + SWAP 13, 0 + SWAP 6, 5, 4 + SWAP 14, 1 + SWAP 15, 7 + ; 3,0,2,6,4,5,1,7 -> 11,13,3,4,5,6,14,15 + mova [rsp+21*16], m11 + %define %%p3mem [rsp+21*16] +%else + %define m11 [esp+26*16] + %define m13 [esp+27*16] + %define m14 [esp+28*16] + %define m15 [esp+29*16] + mova m11, m3 + mova m13, m0 + SWAP 3, 2 + SWAP 6, 5, 4 + mova m14, m1 + mova m15, m7 + %define %%p3mem [esp+26*16] +%endif +%else + %if ARCH_X86_64 + SWAP 13, 3, 0 + SWAP 14, 5, 6, 4, 2 + ; 3,0,2,6,4,5 -> 13,3,4,5,6,14 + %else + %define m13 %%p2mem + %define m14 %%q2mem + mova m13, m3 + mova m14, m5 + SWAP 3, 0 + SWAP 5, 6, 4, 2 + ; 0,2,6,4 -> 3,4,5,6 + %endif +%endif +%else +%if ARCH_X86_64 + mova [rsp+20*16], m12 +%endif + ; load and 16x16 transpose. We only use 14 pixels but we'll need the + ; remainder at the end for the second transpose +%if ARCH_X86_32 + %xdefine m8 m0 + %xdefine m9 m1 + %xdefine m10 m2 + %xdefine m11 m3 + %xdefine m12 m4 + %xdefine m13 m5 + %xdefine m14 m6 + %xdefine m15 m7 + lea tmpq, [dstq+strideq*8] + movu m8, [tmpq+strideq*0-8] + movu m9, [tmpq+strideq*1-8] + movu m10, [tmpq+strideq*2-8] + movu m11, [tmpq+stride3q -8] + lea tmpq, [tmpq+strideq*4] + movu m12, [tmpq+strideq*0-8] + movu m13, [tmpq+strideq*1-8] + movu m14, [tmpq+strideq*2-8] + movu m15, [tmpq+stride3q -8] + mova [esp+ 8*16], m8 + mova [esp+ 9*16], m9 + mova [esp+10*16], m10 + mova [esp+11*16], m11 + mova [esp+12*16], m12 + mova [esp+13*16], m13 + mova [esp+14*16], m14 + mova [esp+15*16], m15 +%endif + movu m0, [dstq+strideq*0-8] + movu m1, [dstq+strideq*1-8] + movu m2, [dstq+strideq*2-8] + movu m3, [dstq+stride3q -8] + lea tmpq, [dstq+strideq*4] + movu m4, [tmpq+strideq*0-8] + movu m5, [tmpq+strideq*1-8] + movu m6, [tmpq+strideq*2-8] + movu m7, [tmpq+stride3q -8] + lea tmpq, [tmpq+strideq*4] +%if ARCH_X86_64 + movu m8, [tmpq+strideq*0-8] + movu m9, [tmpq+strideq*1-8] + movu m10, [tmpq+strideq*2-8] + movu m11, [tmpq+stride3q -8] + lea tmpq, [tmpq+strideq*4] + movu m12, [tmpq+strideq*0-8] + movu m13, [tmpq+strideq*1-8] + movu m14, [tmpq+strideq*2-8] + movu m15, [tmpq+stride3q -8] +%endif + +%if ARCH_X86_64 + TRANSPOSE_16X16B 0, [rsp+11*16] + mova [rsp+12*16], m1 + mova [rsp+13*16], m2 + mova [rsp+14*16], m3 + mova [rsp+15*16], m12 + mova [rsp+16*16], m13 + mova [rsp+17*16], m14 + mova [rsp+18*16], m15 + ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15 + SWAP 12, 4, 7 + SWAP 13, 5, 8 + SWAP 3, 6, 9 + SWAP 10, 14 + SWAP 11, 15 + mova [rsp+21*16], m12 + %define %%p3mem [rsp+21*16] + mova m12, [rsp+20*16] +%else + TRANSPOSE_16X16B 0, [esp+16*16] + %define %%p3mem [esp+26*16] + %define m11 %%p3mem + %define m13 %%p2mem + %define m14 %%q2mem + %define m15 %%q3mem +%endif +%endif ; if 4 elif 6 or 8 else 16 +%endif ; if v else h + + ; load L/E/I/H +%if ARCH_X86_32 + mov l_strideq, l_stridem +%endif +%ifidn %2, v + movu m1, [lq] + movu m0, [lq+l_strideq] +%else + %if ARCH_X86_32 + lea l_stride3q, [l_strideq*3] + %endif + movq xm1, [lq] + movq xm2, [lq+l_strideq*2] + 
movhps xm1, [lq+l_strideq]
+ movhps xm2, [lq+l_stride3q]
+ shufps m0, m1, m2, q3131
+ shufps m1, m2, q2020
+ %if ARCH_X86_32
+ lea stride3q, [strideq*3]
+ %endif
+%endif
+
+%if ARCH_X86_32
+ %ifidn %2, v
+ mov lutd, lutm
+ %endif
+%endif
+ pxor m2, m2
+ pcmpeqb m7, m2, m0
+ pand m1, m7
+ por m0, m1 ; l[x][] ? l[x][] : l[x-stride][]
+ pshufb m0, [PIC_sym(pb_4x0_4x4_4x8_4x12)] ; l[x][1]
+ pcmpeqb m2, m0 ; !L
+ psrlq m7, m0, [lutq+128]
+ pand m7, [PIC_sym(pb_63)]
+ pminub m7, minlvl
+ pmaxub m7, [PIC_sym(pb_1)] ; I
+ pand m1, m0, [PIC_sym(pb_240)]
+ psrlq m1, 4 ; H
+ paddb m0, [PIC_sym(pb_2)]
+ paddb m0, m0
+ paddb m0, m7 ; E
+ pxor m1, [PIC_sym(pb_128)]
+ pxor m7, [PIC_sym(pb_128)]
+ pxor m0, [PIC_sym(pb_128)]
+ SWAP 2, 7
+
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 2, 10
+%else
+ %ifidn %2, v
+ mov mstrideq, strideq
+ neg mstrideq
+ %if %1 == 4
+ lea tmpq, [dstq+mstrideq*2]
+ %elif %1 == 6 || %1 == 8
+ lea tmpq, [dstq+mstrideq*4]
+ %endif
+ %endif
+ mova [esp+3*16], m0
+ mova [esp+4*16], m2
+%endif
+
+ ABSSUB m0, m3, m4, m2 ; abs(p1-p0)
+ pmaxub m0, m7
+ ABSSUB m2, m5, m6, m7 ; abs(q1-q0)
+ pmaxub m0, m2
+%if %1 == 4
+ pxor m0, [PIC_sym(pb_128)]
+ pcmpgtb m7, m0, m1 ; hev
+ %if ARCH_X86_64
+ SWAP 7, 11
+ %else
+ mova [esp+5*16], m7
+ %endif
+%else
+ pxor m7, m0, [PIC_sym(pb_128)]
+ pcmpgtb m7, m1 ; hev
+%if ARCH_X86_64
+ SWAP 7, 11
+%else
+ mova [esp+5*16], m7
+%endif
+
+%if %1 == 6
+ ABSSUB m1, m13, m4, m7 ; abs(p2-p0)
+ pmaxub m1, m0
+%else
+ mova m2, %%p3mem
+ ABSSUB m1, m2, m4, m7 ; abs(p3-p0)
+ pmaxub m1, m0
+ ABSSUB m7, m13, m4, m2 ; abs(p2-p0)
+ pmaxub m1, m7
+%endif
+ ABSSUB m7, m5, m14, m2 ; abs(q2-q0)
+ pmaxub m1, m7
+%if %1 != 6
+ ABSSUB m7, m5, m15, m2 ; abs(q3-q0)
+ pmaxub m1, m7
+%endif
+ pxor m1, [PIC_sym(pb_128)]
+ pcmpgtb m1, [PIC_sym(pb_129)] ; !flat8in
+%if ARCH_X86_64
+ SWAP 1, 9
+%else
+ mova [esp+6*16], m1
+%endif
+
+%if %1 == 6
+ ABSSUB m7, m13, m3, m1 ; abs(p2-p1)
+%else
+ mova m2, %%p3mem
+ ABSSUB m7, m2, m13, m1 ; abs(p3-p2)
+ ABSSUB m2, m13, m3, m1 ; abs(p2-p1)
+ pmaxub m7, m2
+ ABSSUB m2, m14, m15, m1 ; abs(q3-q2)
+ pmaxub m7, m2
+%endif
+ ABSSUB m2, m14, m6, m1 ; abs(q2-q1)
+ pmaxub m7, m2
+%if ARCH_X86_32
+ %define m12 m1
+ mova m12, maskmem
+%endif
+ pand m2, m12, mask1
+ pcmpeqd m2, m12
+ pand m7, m2 ; only apply fm-wide to wd>4 blocks
+ pmaxub m0, m7
+
+ pxor m0, [PIC_sym(pb_128)]
+%endif ; %if %1 == 4 else
+%if ARCH_X86_64
+ SWAP 2, 10
+ pcmpgtb m0, m2
+%else
+ pcmpgtb m0, [esp+4*16]
+%endif
+
+ ABSSUB m1, m3, m6, m7 ; abs(p1-q1)
+ ABSSUB m7, m4, m5, m2 ; abs(p0-q0)
+ paddusb m7, m7
+ pand m1, [PIC_sym(pb_254)]
+ psrlq m1, 1
+ paddusb m1, m7 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+ pxor m1, [PIC_sym(pb_128)]
+%if ARCH_X86_64
+ pcmpgtb m1, m8 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
+%else
+ pcmpgtb m1, [esp+3*16]
+%endif
+ por m0, m1
+
+%if %1 == 16
+%if ARCH_X86_64
+ SWAP 0, 8
+%else
+ mova [esp+3*16], m0
+%endif
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*8]
+ mova m0, [tmpq+strideq*1]
+%else
+ mova m0, [rsp+12*16]
+%endif
+ ABSSUB m1, m0, m4, m2
+%ifidn %2, v
+ mova m0, [tmpq+strideq*2]
+%else
+ mova m0, [rsp+13*16]
+%endif
+ ABSSUB m2, m0, m4, m7
+ pmaxub m1, m2
+%ifidn %2, v
+ mova m0, [tmpq+stride3q]
+%else
+ mova m0, [rsp+14*16]
+%endif
+ ABSSUB m2, m0, m4, m7
+ pmaxub m1, m2
+%ifidn %2, v
+ lea tmpq, [dstq+strideq*4]
+ mova m0, [tmpq+strideq*0]
+%else
+ mova m0, [rsp+15*16]
+%endif
+ ABSSUB m2, m0, m5, m7
+ pmaxub m1, m2
+%ifidn %2, v
+ mova m0, [tmpq+strideq*1]
+%else
+ mova m0, [rsp+16*16]
+%endif
+ ABSSUB m2, m0, m5, m7
+ pmaxub m1, m2
+%ifidn %2, v
+ mova m0, 
[tmpq+strideq*2] +%else + mova m0, [rsp+17*16] +%endif + ABSSUB m2, m0, m5, m7 + pmaxub m1, m2 + pxor m1, [PIC_sym(pb_128)] + pcmpgtb m1, [PIC_sym(pb_129)] ; !flat8out +%if ARCH_X86_64 + por m1, m9 ; !flat8in | !flat8out +%else + por m1, [esp+6*16] + %define m12 m7 + mova m12, maskmem +%endif + pand m2, m12, mask2 + pcmpeqd m2, m12 + pandn m1, m2 ; flat16 +%if ARCH_X86_64 + pandn m2, m8, m1 ; flat16 & fm +%else + pandn m2, [esp+3*16], m1 ; flat16 & fm + mova %%flat16mem, m2 +%endif + SWAP 1, 2 + + pand m2, m12, mask1 + pcmpeqd m2, m12 +%if ARCH_X86_64 + pandn m9, m2 ; flat8in + pandn m2, m8, m9 + SWAP 2, 9 +%else + pandn m0, [esp+6*16], m2 + pandn m2, [esp+3*16], m0 + mova [esp+6*16], m2 +%endif + pand m2, m12, mask0 + pcmpeqd m2, m12 +%if ARCH_X86_64 + pandn m8, m2 + pandn m2, m9, m8 ; fm & !flat8 & !flat16 + SWAP 2, 8 + pandn m2, m1, m9 ; flat8 & !flat16 + SWAP 2, 9 + SWAP 0, 8 + SWAP 1, 10 +%else + pandn m0, [esp+3*16], m2 + pandn m2, [esp+6*16], m0 + SWAP 2, 0 + pandn m2, m1, [esp+6*16] + mova %%flat8mem, m2 +%endif +%elif %1 != 4 + %if ARCH_X86_64 + SWAP 1, 9 + %else + %define m12 m7 + mova m12, maskmem + mova m1, [esp+6*16] + %endif + pand m2, m12, mask1 + pcmpeqd m2, m12 + pandn m1, m2 + pandn m2, m0, m1 ; flat8 & fm + pand m1, m12, mask0 + pcmpeqd m1, m12 + pandn m0, m1 + pandn m1, m2, m0 ; fm & !flat8 + SWAP 1, 2, 0 + %if ARCH_X86_64 + SWAP 1, 9 + %else + mova %%flat8mem, m1 + %endif +%else +%if ARCH_X86_32 + %define m12 m1 + mova m12, maskmem +%endif + pand m2, m12, mask0 + pcmpeqd m2, m12 + pandn m0, m2 ; fm +%endif + + ; short filter + + mova m1, [PIC_sym(pb_128)] +%if ARCH_X86_64 + SWAP 7, 11 +%else + mova m7, [esp+5*16] +%endif + pxor m3, m1 + pxor m6, m1 + pxor m4, m1 + pxor m5, m1 + psubsb m1, m3, m6 ; iclip_diff(p1-q1) + pand m1, m7 ; f=iclip_diff(p1-q1)&hev + psubsb m2, m5, m4 + paddsb m1, m2 + paddsb m1, m2 + paddsb m1, m2 ; f=iclip_diff(3*(q0-p0)+f) + mova m2, [PIC_sym(pb_16)] + pand m0, m1 ; f&=fm + paddsb m1, m0, [PIC_sym(pb_3)] + paddsb m0, [PIC_sym(pb_4)] + pand m1, [PIC_sym(pb_248)] + pand m0, [PIC_sym(pb_248)] + psrlq m1, 3 + psrlq m0, 3 + pxor m1, m2 + pxor m0, m2 + psubb m1, m2 ; f2 + psubb m0, m2 ; f1 + mova m2, [PIC_sym(pb_128)] + paddsb m4, m1 + psubsb m5, m0 + pxor m4, m2 + pxor m5, m2 + + pxor m0, m2 + pxor m1, m1 + pavgb m0, m1 ; f=(f1+1)>>1 + psubb m0, [PIC_sym(pb_64)] + pandn m7, m0 ; f&=!hev + paddsb m3, m7 + psubsb m6, m7 + pxor m3, m2 + pxor m6, m2 + +%if %1 == 16 + ; flat16 filter +%ifidn %2, v + lea tmpq, [dstq+mstrideq*8] + mova m0, [tmpq+strideq*1] ; p6 + mova m2, [tmpq+strideq*2] ; p5 + mova m7, [tmpq+stride3q] ; p4 +%else + mova m0, [rsp+12*16] + mova m2, [rsp+13*16] + mova m7, [rsp+14*16] +%endif + +%if ARCH_X86_64 + SWAP 1, 10 + mova %%flat8mem, m9 + mova %%q2mem, m14 + mova %%q3mem, m15 + SWAP 0, 8 + SWAP 1, 9 +%else + %ifidn %2, v + mova [esp+17*16], m0 + mova [esp+19*16], m3 + mova [esp+21*16], m4 + mova [esp+22*16], m5 + mova [esp+23*16], m6 + %xdefine m11 m3 + %xdefine m14 m4 + %xdefine m15 m5 + %xdefine m10 m6 + %define m13 %%p2mem + %define m8 [esp+17*16] + %define m9 %%flat16mem + %define m3 [esp+19*16] + %define m4 [esp+21*16] + %define m5 [esp+22*16] + %define m6 [esp+23*16] + %else + mova [esp+31*16], m0 + mova [esp+32*16], m3 + mova [esp+33*16], m4 + mova [esp+34*16], m5 + mova [esp+35*16], m6 + %xdefine m11 m3 + %xdefine m14 m4 + %xdefine m15 m5 + %xdefine m10 m6 + %define m13 %%p2mem + %define m8 [esp+31*16] + %define m9 %%flat16mem + %define m3 [esp+32*16] + %define m4 [esp+33*16] + %define m5 [esp+34*16] + %define m6 
[esp+35*16] + %endif +%endif + + ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A + ; write -6 + mova m11, %%p3mem +%if ARCH_X86_64 + punpcklbw m14, m8, m11 + punpckhbw m15, m8, m11 +%else + punpcklbw m14, m0, m11 + punpckhbw m15, m0, m11 +%endif +%ifidn %2, v + mova [rsp+5*16], m11 +%endif + pmaddubsw m10, m14, [PIC_sym(pb_7_1)] + pmaddubsw m11, m15, [PIC_sym(pb_7_1)] ; p6*7+p3 + punpcklbw m0, m2, m7 + punpckhbw m1, m2, m7 + pmaddubsw m0, [PIC_sym(pb_2)] + pmaddubsw m1, [PIC_sym(pb_2)] + paddw m10, m0 + paddw m11, m1 ; p6*7+p5*2+p4*2+p3 + punpcklbw m0, m13, m3 + punpckhbw m1, m13, m3 + pmaddubsw m0, [PIC_sym(pb_1)] + pmaddubsw m1, [PIC_sym(pb_1)] + paddw m10, m0 + paddw m11, m1 ; p6*7+p5*2+p4*2+p3+p2+p1 + punpcklbw m0, m4, m5 + punpckhbw m1, m4, m5 + pmaddubsw m0, [PIC_sym(pb_1)] + pmaddubsw m1, [PIC_sym(pb_1)] + paddw m10, m0 + paddw m11, m1 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 + pmulhrsw m0, m10, [PIC_sym(pw_2048)] + pmulhrsw m1, m11, [PIC_sym(pw_2048)] + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, m2 + por m0, m1 +%ifidn %2, v + mova [tmpq+strideq*2], m0 ; p5 +%else + mova [rsp+13*16], m0 +%endif + + ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B + ; write -5 + pmaddubsw m14, [PIC_sym(pb_m1_1)] + pmaddubsw m15, [PIC_sym(pb_m1_1)] + paddw m10, m14 + paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0 + punpcklbw m0, m8, m6 + punpckhbw m1, m8, m6 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + mova [rsp+3*16], m0 + mova [rsp+4*16], m1 + paddw m10, m0 + paddw m11, m1 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1 + pmulhrsw m0, m10, [PIC_sym(pw_2048)] + pmulhrsw m1, m11, [PIC_sym(pw_2048)] + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, m7 + por m0, m1 +%ifidn %2, v + mova [tmpq+stride3q], m0 ; p4 +%else + mova [rsp+14*16], m0 +%endif + + ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C + ; write -4 + mova m14, %%q2mem + punpcklbw m0, m8, m13 + punpckhbw m1, m8, m13 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + paddw m10, m0 + paddw m11, m1 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1 + punpcklbw m0, m2, m14 + punpckhbw m2, m14 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m2, [PIC_sym(pb_m1_1)] + mova [rsp+1*16], m0 + paddw m10, m0 + paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2 + pmulhrsw m0, m10, [PIC_sym(pw_2048)] + pmulhrsw m1, m11, [PIC_sym(pw_2048)] + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, %%p3mem + por m0, m1 +%ifidn %2, v + mova [tmpq+strideq*4], m0 ; p3 +%else + mova [rsp+19*16], m0 +%endif + + ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D + ; write -3 + mova m15, %%q3mem + punpcklbw m0, m8, m3 + punpckhbw m1, m8, m3 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + paddw m10, m0 + paddw m11, m1 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2 + punpcklbw m0, m7, m15 + punpckhbw m7, m15 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m7, [PIC_sym(pb_m1_1)] + mova [rsp+2*16], m0 +%if ARCH_X86_32 + %ifidn %2, v + mova [esp+24*16], m7 + %else + mova [esp+36*16], m7 + %endif +%endif + paddw m10, m0 + paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3 + pmulhrsw m0, m10, [PIC_sym(pw_2048)] + pmulhrsw m1, m11, [PIC_sym(pw_2048)] + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, m13 + por m0, m1 + mova [rsp+6*16], m0 ; don't clobber p2/m13 since we need it in F + + ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E + ; write -2 + punpcklbw m0, m8, m4 + punpckhbw m1, m8, m4 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + paddw m10, m0 + paddw m11, m1 ; 
p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3 +%if ARCH_X86_64 + SWAP 7, 8 +%endif +%ifidn %2, v + mova m1, [dstq+strideq*4] ; q4 + mova m7, [rsp+5*16] ; (pre-filter) p3 +%else + mova m1, [rsp+15*16] + mova m7, %%p3mem ; (pre-filter) p3 +%endif + punpcklbw m0, m1, m7 + punpckhbw m1, m1, m7 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + mova [rsp+7*16], m0 + mova [rsp+5*16], m1 + psubw m10, m0 + psubw m11, m1 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4 + pmulhrsw m0, m10, [PIC_sym(pw_2048)] + pmulhrsw m1, m11, [PIC_sym(pw_2048)] + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, m3 + por m0, m1 + mova [rsp+8*16], m0 ; don't clobber p1/m3 since we need it in G + + ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F + ; write -1 +%ifidn %2, v + mova m7, [tmpq+strideq*1] ; p6 + lea tmpq, [dstq+strideq*4] + mova m1, [tmpq+strideq*1] ; q5 +%else + mova m7, [rsp+12*16] ; p6 + mova m1, [rsp+16*16] +%endif + punpcklbw m0, m7, m5 + punpckhbw m7, m5 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m7, [PIC_sym(pb_m1_1)] + paddw m10, m0 + paddw m11, m7 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4 + punpcklbw m7, m13, m1 + pmaddubsw m7, [PIC_sym(pb_m1_1)] + mova [rsp+9*16], m7 + paddw m10, m7 +%if ARCH_X86_64 + punpckhbw m13, m1 + mova m1, [rsp+6*16] + SWAP 1, 13 +%else + punpckhbw m7, m13, m1 + mova m1, [esp+6*16] + mova m13, m1 + SWAP 1, 7 +%endif + pmaddubsw m1, [PIC_sym(pb_m1_1)] + mova [rsp+10*16], m1 + paddw m11, m1 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5 + pmulhrsw m7, m10, [PIC_sym(pw_2048)] + pmulhrsw m0, m11, [PIC_sym(pw_2048)] + packuswb m7, m0 + pand m7, m9 + pandn m0, m9, m4 + por m7, m0 + mova [rsp+6*16], m7 ; don't clobber p0/m4 since we need it in H + + ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G + ; write +0 +%ifidn %2, v + mova m7, [tmpq+strideq*2] ; q6 +%else + mova m7, [rsp+17*16] +%endif + paddw m10, [rsp+3*16] + paddw m11, [rsp+4*16] ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5 + punpcklbw m0, m3, m7 + punpckhbw m1, m3, m7 +%if ARCH_X86_64 + mova m3, [rsp+8*16] +%endif + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m1, [PIC_sym(pb_m1_1)] + mova [rsp+3*16], m0 + mova [rsp+4*16], m1 + paddw m10, m0 + paddw m11, m1 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6 + pmulhrsw m0, m10, [PIC_sym(pw_2048)] + pmulhrsw m1, m11, [PIC_sym(pw_2048)] + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, m5 + por m0, m1 +%if ARCH_X86_32 + mova m1, [esp+8*16] + mova m3, m1 +%endif + mova [rsp+8*16], m0 ; don't clobber q0/m5 since we need it in I + + ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H + ; write +1 + paddw m10, [rsp+1*16] + paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6 + punpcklbw m0, m4, m7 + punpckhbw m2, m4, m7 + pmaddubsw m0, [PIC_sym(pb_m1_1)] + pmaddubsw m2, [PIC_sym(pb_m1_1)] + paddw m10, m0 + paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2 +%if ARCH_X86_64 + mova m4, [rsp+6*16] +%else + %define m4 [esp+6*16] +%endif + pmulhrsw m2, m10, [PIC_sym(pw_2048)] + pmulhrsw m1, m11, [PIC_sym(pw_2048)] + packuswb m2, m1 + pand m2, m9 + pandn m1, m9, m6 + por m2, m1 ; don't clobber q1/m6 since we need it in K + + ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I + ; write +2 + paddw m10, [rsp+2*16] +%if ARCH_X86_64 + SWAP 7, 8 + paddw m11, m7 +%else + mova m8, m7 + %ifidn %2, v + paddw m11, [esp+24*16] ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2 + %else + paddw m11, [esp+36*16] ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2 + %endif +%endif + punpcklbw m0, m5, m8 + punpckhbw m1, m5, m8 +%if ARCH_X86_64 + mova m5, 
[rsp+8*16]
+%else
+ %define m5 [esp+8*16]
+%endif
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3
+ pmulhrsw m7, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m7, m1
+ pand m7, m9
+ pandn m1, m9, m14
+ por m7, m1 ; don't clobber q2/m14 since we need it in K
+
+ ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
+ ; write +3
+ psubw m10, [rsp+7*16]
+ psubw m11, [rsp+5*16] ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
+ punpcklbw m0, m6, m8
+ punpckhbw m1, m6, m8
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4
+ pmulhrsw m0, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m15
+ por m0, m1
+%ifidn %2, v
+ mova [tmpq+mstrideq], m0 ; q3
+%else
+ mova [rsp+20*16], m0
+%endif
+
+ ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K
+ ; write +4
+ paddw m10, [rsp+ 9*16]
+ paddw m11, [rsp+10*16] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
+ punpcklbw m0, m14, m8
+ punpckhbw m1, m14, m8
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
+ pmulhrsw m0, m10, [PIC_sym(pw_2048)]
+ pmulhrsw m1, m11, [PIC_sym(pw_2048)]
+ packuswb m0, m1
+ pand m0, m9
+%ifidn %2, v
+ pandn m1, m9, [tmpq+strideq*0]
+%else
+ pandn m1, m9, [rsp+15*16]
+%endif
+ por m0, m1
+%ifidn %2, v
+ mova [tmpq+strideq*0], m0 ; q4
+%else
+ mova [rsp+15*16], m0
+%endif
+
+ ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L
+ ; write +5
+ paddw m10, [rsp+3*16]
+ paddw m11, [rsp+4*16] ; p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*6
+ punpcklbw m0, m15, m8
+ punpckhbw m1, m15, m8
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m10, m0
+ paddw m11, m1 ; p0+q0+q1+q2+q3+q4*2+q5*2+q6*7
+ pmulhrsw m10, [PIC_sym(pw_2048)]
+ pmulhrsw m11, [PIC_sym(pw_2048)]
+ packuswb m10, m11
+ pand m10, m9
+%ifidn %2, v
+ pandn m11, m9, [tmpq+strideq*1]
+%else
+ pandn m11, m9, [rsp+16*16]
+%endif
+ por m10, m11
+%ifidn %2, v
+ mova [tmpq+strideq*1], m10 ; q5
+%else
+ mova [rsp+16*16], m10
+%endif
+
+%if ARCH_X86_64
+ SWAP 0, 8
+ SWAP 1, 9
+ SWAP 14, 7
+%else
+ %xdefine m3 m11
+ %xdefine m4 m14
+ %xdefine m5 m15
+ %xdefine m6 m10
+ mova %%q2mem, m7
+ %ifidn %2, v
+ mova m3, [esp+19*16]
+ %else
+ mova m3, [esp+32*16]
+ %endif
+ mova m4, [esp+ 6*16]
+ mova m5, [esp+ 8*16]
+%endif
+ SWAP m6, m2
+
+%if ARCH_X86_64
+ mova m9, %%flat8mem
+%endif
+%ifidn %2, v
+ lea tmpq, [dstq+mstrideq*4]
+%endif
+%endif ; if %1 == 16
+%if %1 >= 8
+ ; flat8 filter
+%if ARCH_X86_32
+ %define m9 %%flat8mem
+ %define m11 m1
+ %define m13 %%p2mem
+ %define m14 %%q2mem
+ %define m15 %%q3mem
+%endif
+ mova m11, %%p3mem
+ punpcklbw m0, m11, m3
+ punpcklbw m7, m13, m4
+ pmaddubsw m2, m0, [PIC_sym(pb_3_1)] ; 3 * p3 + p1
+ pmaddubsw m7, [PIC_sym(pb_2_1)]
+ paddw m2, m7 ; 3 * p3 + 2 * p2 + p1 + p0
+ punpcklbw m7, m5, [PIC_sym(pb_4)]
+ pmaddubsw m7, [PIC_sym(pb_1)]
+ paddw m2, m7 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
+ punpckhbw m1, m11, m3
+ pmaddubsw m7, m1, [PIC_sym(pb_3_1)] ; 3 * p3 + p1
+ punpckhbw m0, m13, m4
+ pmaddubsw m0, [PIC_sym(pb_2_1)]
+ paddw m7, m0 ; 3 * p3 + 2 * p2 + p1 + p0
+ punpckhbw m0, m5, [PIC_sym(pb_4)]
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ paddw m7, m0 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
+ psrlw m0, m2, 3
+ psrlw m1, m7, 3
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m13
+ por m0, m1 ; p2
+%ifidn %2, v
+ mova [tmpq+strideq*1], m0
+%else
+ %if ARCH_X86_64
+ SWAP 0, 10
+ %else
+ mova [esp+2*16], m0
+ %endif
+%endif
+
+%if ARCH_X86_32
+ mova m11, %%p3mem
+%endif
+ punpcklbw m0, m11, m3
+ punpckhbw m1, m11, m3
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m2, m0
+ paddw m7, m1
+ punpcklbw m0, m13, m6
+ punpckhbw m1, m13, m6
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m2, m0
+ paddw m7, m1 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4
+ psrlw m0, m2, 3
+ psrlw m1, m7, 3
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m3
+ por m0, m1 ; p1
+%ifidn %2, v
+ mova [tmpq+strideq*2], m0
+%else
+ mova [rsp+0*16], m0
+%endif
+
+%if ARCH_X86_32
+ mova m11, %%p3mem
+%endif
+ punpcklbw m0, m11, m3
+ punpckhbw m1, m11, m3
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m1, [PIC_sym(pb_1)]
+ psubw m2, m0
+ psubw m7, m1
+ punpcklbw m0, m4, m14
+ punpckhbw m1, m4, m14
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m1, [PIC_sym(pb_1)]
+ paddw m2, m0
+ paddw m7, m1 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4
+ psrlw m0, m2, 3
+ psrlw m1, m7, 3
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m4
+ por m0, m1 ; p0
+%ifidn %2, v
+ mova [tmpq+stride3q], m0
+%else
+ mova [rsp+1*16], m0
+%endif
+
+ punpcklbw m0, m5, m15
+ punpckhbw m1, m5, m15
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m1, [PIC_sym(pb_1)]
+ paddw m2, m0
+ paddw m7, m1
+%if ARCH_X86_32
+ mova m11, %%p3mem
+%endif
+ punpcklbw m0, m11, m4
+ punpckhbw m11, m11, m4
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m11, [PIC_sym(pb_1)]
+ psubw m2, m0
+ psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4
+ psrlw m0, m2, 3
+ psrlw m11, m7, 3
+ packuswb m0, m11
+ pand m0, m9
+ pandn m11, m9, m5
+ por m11, m0 ; q0
+%ifidn %2, v
+ mova [dstq+strideq*0], m11
+%elif ARCH_X86_32
+ mova [esp+8*16], m11
+%endif
+
+ punpcklbw m0, m5, m15
+ punpckhbw m1, m5, m15
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m2, m0
+ paddw m7, m1
+ punpcklbw m0, m13, m6
+ punpckhbw m1, m13, m6
+ pmaddubsw m0, [PIC_sym(pb_m1_1)]
+ pmaddubsw m1, [PIC_sym(pb_m1_1)]
+ paddw m2, m0
+ paddw m7, m1 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4
+ psrlw m0, m2, 3
+ psrlw m1, m7, 3
+ packuswb m0, m1
+ pand m0, m9
+ pandn m1, m9, m6
+ por m0, m1 ; q1
+%ifidn %2, v
+ mova [dstq+strideq*1], m0
+%else
+ %if ARCH_X86_64
+ SWAP 0, 13
+ %else
+ mova [esp+9*16], m0
+ %endif
+%endif
+
+ punpcklbw m0, m3, m6
+ punpckhbw m1, m3, m6
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m1, [PIC_sym(pb_1)]
+ psubw m2, m0
+ psubw m7, m1
+ punpcklbw m0, m14, m15
+ punpckhbw m1, m14, m15
+ pmaddubsw m0, [PIC_sym(pb_1)]
+ pmaddubsw m1, [PIC_sym(pb_1)]
+ paddw m2, m0
+ paddw m7, m1 ; p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4
+ psrlw m2, 3
+ psrlw m7, 3
+ packuswb m2, m7
+ pand m2, m9
+ pandn m7, m9, m14
+ por m2, m7 ; q2
+%ifidn %2, v
+ mova [dstq+strideq*2], m2
+%else
+ mova m0, [rsp+0*16]
+%if %1 == 8
+ mova m1, [rsp+1*16]
+ mova m4, %%p3mem
+
+%if ARCH_X86_32
+ %define m10 [esp+2*16]
+ %define m11 [esp+8*16]
+ %define m13 [esp+9*16]
+%endif
+
+ ; 16x8 transpose
+ punpcklbw m3, m4, m10
+ punpckhbw m4, m10
+ punpcklbw m5, m0, m1
+ punpckhbw m0, m1
+ punpcklbw m1, m11, m13
+ punpckhbw m6, m11, m13
+ punpcklbw m7, m2, m15
+ punpckhbw m2, m15
+%if ARCH_X86_64
+ SWAP 2, 15
+%else
+ mova m15, m2
+%endif
+
+ punpcklwd m2, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m4, m0
+ punpckhwd m4, m0
+ punpcklwd m0, m1, m7
+ punpckhwd m1, m7
+ punpcklwd m7, m6, m15
+ punpckhwd m6, m15
+%if ARCH_X86_64
+ SWAP 6, 15
+%else
+ mova m15, m6
+%endif + + punpckldq m6, m2, m0 + punpckhdq m2, m0 + punpckldq m0, m3, m1 + punpckhdq m3, m1 + punpckldq m1, m5, m7 + punpckhdq m5, m7 + punpckldq m7, m4, m15 + punpckhdq m4, m15 + + ; write 8x16 + movq [dstq+strideq*0-4], xm6 + movhps [dstq+strideq*1-4], xm6 + movq [dstq+strideq*2-4], xm2 + movhps [dstq+stride3q -4], xm2 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm0 + movhps [dstq+strideq*1-4], xm0 + movq [dstq+strideq*2-4], xm3 + movhps [dstq+stride3q -4], xm3 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm1 + movhps [dstq+strideq*1-4], xm1 + movq [dstq+strideq*2-4], xm5 + movhps [dstq+stride3q -4], xm5 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0-4], xm7 + movhps [dstq+strideq*1-4], xm7 + movq [dstq+strideq*2-4], xm4 + movhps [dstq+stride3q -4], xm4 + lea dstq, [dstq+strideq*4] +%else + ; 16x16 transpose and store + SWAP 6, 0 + SWAP 7, 1 + %if ARCH_X86_64 + SWAP 5, 10, 2 + SWAP 8, 11 + SWAP 9, 13 + mova [rsp+21*16], m12 + %else + mova [esp+10*16], m2 + %xdefine m8 m0 + %xdefine m9 m1 + %xdefine m10 m2 + %xdefine m11 m3 + %xdefine m12 m4 + %xdefine m13 m5 + %xdefine m14 m6 + %xdefine m15 m7 + %endif + mova m0, [rsp+11*16] + mova m1, [rsp+12*16] + mova m2, [rsp+13*16] + mova m3, [rsp+14*16] + mova m4, [rsp+19*16] +%if ARCH_X86_64 + mova m7, [rsp+ 1*16] + mova m11, [rsp+20*16] + mova m12, [rsp+15*16] + mova m13, [rsp+16*16] + mova m14, [rsp+17*16] + TRANSPOSE_16X16B 1, [rsp+18*16] +%else + mova m5, [esp+ 2*16] + TRANSPOSE_16X16B 1, [esp+32*16] + mov tmpq, dstq + lea dstq, [dstq+strideq*8] +%endif + movu [dstq+strideq*0-8], xm0 + movu [dstq+strideq*1-8], xm1 + movu [dstq+strideq*2-8], xm2 + movu [dstq+stride3q -8], xm3 + lea dstq, [dstq+strideq*4] + movu [dstq+strideq*0-8], xm4 + movu [dstq+strideq*1-8], xm5 + movu [dstq+strideq*2-8], xm6 + movu [dstq+stride3q -8], xm7 +%if ARCH_X86_64 + lea dstq, [dstq+strideq*4] +%else + %xdefine m8 m0 + %xdefine m9 m1 + %xdefine m10 m2 + %xdefine m11 m3 + %xdefine m12 m4 + %xdefine m13 m5 + %xdefine m14 m6 + %xdefine m15 m7 + mova m8, [esp+11*16] + mova m9, [esp+12*16] + mova m10, [esp+13*16] + mova m11, [esp+14*16] + mova m12, [esp+26*16] + mova m13, [esp+27*16] + mova m14, [esp+ 0*16] + mova m15, [esp+ 1*16] + mov dstq, tmpq +%endif + movu [dstq+strideq*0-8], xm8 + movu [dstq+strideq*1-8], xm9 + movu [dstq+strideq*2-8], xm10 + movu [dstq+stride3q -8], xm11 + lea dstq, [dstq+strideq*4] + movu [dstq+strideq*0-8], xm12 + movu [dstq+strideq*1-8], xm13 + movu [dstq+strideq*2-8], xm14 + movu [dstq+stride3q -8], xm15 + lea dstq, [dstq+strideq*4] +%if ARCH_X86_32 + lea dstq, [dstq+strideq*8] +%else + mova m12, [rsp+21*16] +%endif + +%endif ; if %1 == 8 +%endif ; ifidn %2, v +%elif %1 == 6 + ; flat6 filter +%if ARCH_X86_32 + mova [esp+3*16], m3 + mova [esp+4*16], m4 + mova [esp+5*16], m5 + mova [esp+6*16], m6 + %xdefine m8 m3 + %xdefine m10 m4 + %xdefine m11 m5 + %xdefine m15 m6 + %define m3 [esp+3*16] + %define m4 [esp+4*16] + %define m5 [esp+5*16] + %define m6 [esp+6*16] + %define m9 %%flat8mem + %define m13 %%p2mem + %define m14 %%q2mem +%endif + + punpcklbw m8, m13, m5 + punpckhbw m11, m13, m5 + pmaddubsw m0, m8, [PIC_sym(pb_3_1)] + pmaddubsw m1, m11, [PIC_sym(pb_3_1)] + punpcklbw m7, m4, m3 + punpckhbw m10, m4, m3 + pmaddubsw m2, m7, [PIC_sym(pb_2)] + pmaddubsw m15, m10, [PIC_sym(pb_2)] + paddw m0, m2 + paddw m1, m15 + pmulhrsw m2, m0, [PIC_sym(pw_4096)] + pmulhrsw m15, m1, [PIC_sym(pw_4096)] + packuswb m2, m15 + pand m2, m9 + pandn m15, m9, m3 + por m2, m15 +%ifidn %2, v + mova [tmpq+strideq*2], m2 ; p1 +%elif ARCH_X86_32 
+ mova [esp+11*16], m2 +%endif + + pmaddubsw m8, [PIC_sym(pb_m1_1)] + pmaddubsw m11, [PIC_sym(pb_m1_1)] + paddw m0, m8 + paddw m1, m11 + punpcklbw m8, m13, m6 + punpckhbw m11, m13, m6 +%if ARCH_X86_64 + SWAP 2, 13 +%endif + pmaddubsw m8, [PIC_sym(pb_m1_1)] + pmaddubsw m11, [PIC_sym(pb_m1_1)] + paddw m0, m8 + paddw m1, m11 + pmulhrsw m2, m0, [PIC_sym(pw_4096)] + pmulhrsw m15, m1, [PIC_sym(pw_4096)] + packuswb m2, m15 + pand m2, m9 + pandn m15, m9, m4 + por m2, m15 +%ifidn %2, v + mova [tmpq+stride3q], m2 ; p0 +%elif ARCH_X86_32 + mova [esp+8*16], m2 +%endif + + paddw m0, m8 + paddw m1, m11 + punpcklbw m8, m3, m14 + punpckhbw m11, m3, m14 +%if ARCH_X86_64 + SWAP 2, 14 +%endif + pmaddubsw m2, m8, [PIC_sym(pb_m1_1)] + pmaddubsw m15, m11, [PIC_sym(pb_m1_1)] + paddw m0, m2 + paddw m1, m15 + pmulhrsw m2, m0, [PIC_sym(pw_4096)] + pmulhrsw m15, m1, [PIC_sym(pw_4096)] + packuswb m2, m15 + pand m2, m9 + pandn m15, m9, m5 + por m2, m15 +%ifidn %2, v + mova [dstq+strideq*0], m2 ; q0 +%endif + + pmaddubsw m8, [PIC_sym(pb_m1_2)] + pmaddubsw m11, [PIC_sym(pb_m1_2)] + paddw m0, m8 + paddw m1, m11 + pmaddubsw m7, [PIC_sym(pb_m1_0)] + pmaddubsw m10, [PIC_sym(pb_m1_0)] + paddw m0, m7 + paddw m1, m10 + pmulhrsw m0, [PIC_sym(pw_4096)] + pmulhrsw m1, [PIC_sym(pw_4096)] + packuswb m0, m1 + pand m0, m9 + pandn m1, m9, m6 + por m0, m1 +%if ARCH_X86_32 + %xdefine m3 m8 + %xdefine m4 m10 + %xdefine m5 m11 + %xdefine m6 m15 +%endif +%ifidn %2, v + mova [dstq+strideq*1], m0 ; q1 +%else + %if ARCH_X86_64 + SWAP 3, 13 + SWAP 4, 14 + %else + mova m3, [esp+11*16] + mova m4, [esp+ 8*16] + %endif + SWAP 5, 2 + SWAP 6, 0 + TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7 +%endif +%else ; if %1 == 4 +%ifidn %2, v + mova [tmpq+strideq*0], m3 ; p1 + mova [tmpq+strideq*1], m4 ; p0 + mova [tmpq+strideq*2], m5 ; q0 + mova [tmpq+stride3q ], m6 ; q1 +%else + TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7 +%endif +%endif +%if ARCH_X86_32 + %define m12 m12reg +%endif +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; 32-bit PIC helpers ;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%if ARCH_X86_32 + %define PIC_base_offset $$ + + %macro SETUP_PIC 0 ; PIC_reg + %define PIC_reg r2 + %assign PIC_reg_stk_offset stack_size-gprsize*(1+copy_args*4) + LEA PIC_reg, $$ + %endmacro + + %macro XCHG_PIC_REG 1 ; 0=mask 1=PIC_base + %if %1 == 0 + mov [esp+PIC_reg_stk_offset], PIC_reg + mov PIC_reg, maskm + %else + mov PIC_reg, [esp+PIC_reg_stk_offset] + %endif + %endmacro + + %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset) + +%else + %macro XCHG_PIC_REG 1 + %endmacro + %define PIC_sym(sym) (sym) +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%if ARCH_X86_32 + %if STACK_ALIGNMENT < required_stack_alignment + %assign copy_args 1 + %else + %assign copy_args 0 + %endif +%endif + +%macro RELOC_ARGS 1 + %if copy_args + %define maskm [esp+stack_size-gprsize*1] + %define l_stridem [esp+stack_size-gprsize*2] + %define lutm [esp+stack_size-gprsize*3] + %define %1m [esp+stack_size-gprsize*4] + mov r6d, r6m + mov maskm, maskd + mov lutm, lutd + mov %1m, r6d + %else + %define %1m r6m + %endif +%endmacro + +%if ARCH_X86_32 + %define tmpq r4 + %define mstrideq r5 + %define stride3q r6 + %define l_stride3q r6 +%endif + +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal lpf_v_sb_y_8bpc, 7, 11, 16, 16 * 15, \ + dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp, mask_bits +%else +cglobal lpf_v_sb_y_8bpc, 6, 7, 8, -16 * (26 + copy_args), \ + dst, stride, mask, l, l_stride, lut, mask_bits + RELOC_ARGS w + SETUP_PIC + %define m12 m5 +%endif + 
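+; each .loop iteration filters one 16-pixel-wide column strip (w is counted
+; in 4px units); mask_bits and m12 select the four vmask/pd_mask bits for
+; that strip and are shifted left by 4 at .end to step to the next strip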
shl l_strideq, 2 + sub lq, l_strideq +%if ARCH_X86_64 + mov mstrideq, strideq + neg mstrideq + lea stride3q, [strideq*3] +%else + mov l_stridem, l_strided +%endif + mov mask_bitsd, 0xf + mova m12, [PIC_sym(pd_mask)] + XCHG_PIC_REG 0 + movu m0, [maskq] + pxor m4, m4 + movd m3, [lutq+136] + pshufb m3, m4 + pshufd m2, m0, q2222 + pshufd m1, m0, q1111 + pshufd m0, m0, q0000 + por m1, m2 + por m0, m1 + mova [rsp+11*16], m0 + mova [rsp+12*16], m1 + mova [rsp+13*16], m2 + mova [rsp+14*16], m3 + +%define maskmem [esp+15*16] +%define mask0 [rsp+11*16] +%define mask1 [rsp+12*16] +%define mask2 [rsp+13*16] +%define minlvl [rsp+14*16] + +.loop: + test [maskq+8], mask_bitsd ; vmask[2] + je .no_flat16 + +%if ARCH_X86_32 + XCHG_PIC_REG 1 + mov [esp+25*16], mask_bitsd + mova maskmem, m12 +%endif + FILTER 16, v + jmp .end + +.no_flat16: + test [maskq+4], mask_bitsd ; vmask[1] + je .no_flat + +%if ARCH_X86_32 + XCHG_PIC_REG 1 + mov [esp+25*16], mask_bitsd + mova maskmem, m12 +%endif + FILTER 8, v + jmp .end + +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + XCHG_PIC_REG 1 + je .no_filter + +%if ARCH_X86_32 + mov [esp+25*16], mask_bitsd + mova maskmem, m12 +%endif + FILTER 4, v + +.end: +%if ARCH_X86_32 + mova m12, maskmem + mov mask_bitsd, [esp+25*16] +%endif +.no_filter: + pslld m12, 4 + shl mask_bitsd, 4 + add lq, 16 + add dstq, 16 +%if ARCH_X86_64 + sub wd, 4 +%else + sub dword wm, 4 +%endif + XCHG_PIC_REG 0 + jg .loop + RET + +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal lpf_h_sb_y_8bpc, 7, 11, 16, 16 * 26, \ + dst, stride, mask, l, l_stride, lut, \ + h, stride3, l_stride3, tmp, mask_bits +%else +cglobal lpf_h_sb_y_8bpc, 6, 7, 8, -16 * (39 + copy_args), \ + dst, stride, mask, l, l_stride, lut, mask_bits + RELOC_ARGS h + SETUP_PIC + %define m12 m5 +%endif + sub lq, 4 + shl l_strideq, 2 +%if ARCH_X86_64 + lea stride3q, [strideq*3] + lea l_stride3q, [l_strideq*3] +%else + mov l_stridem, l_strided +%endif + mov mask_bitsd, 0xf + mova m12, [PIC_sym(pd_mask)] + XCHG_PIC_REG 0 + movu m0, [maskq] + pxor m4, m4 + movd m3, [lutq+136] + pshufb m3, m4 + pshufd m2, m0, q2222 + pshufd m1, m0, q1111 + pshufd m0, m0, q0000 + por m1, m2 + por m0, m1 + mova [rsp+22*16], m0 + mova [rsp+23*16], m1 + mova [rsp+24*16], m2 + mova [rsp+25*16], m3 + +%define maskmem [esp+37*16] +%define mask0 [rsp+22*16] +%define mask1 [rsp+23*16] +%define mask2 [rsp+24*16] +%define minlvl [rsp+25*16] + +.loop: + test [maskq+8], mask_bitsd ; vmask[2] + je .no_flat16 + +%if ARCH_X86_32 + XCHG_PIC_REG 1 + mov [esp+38*16], mask_bitsd + mova maskmem, m12 +%endif + FILTER 16, h + jmp .end + +.no_flat16: + test [maskq+4], mask_bitsd ; vmask[1] + je .no_flat + +%if ARCH_X86_32 + XCHG_PIC_REG 1 + mov [esp+38*16], mask_bitsd + mova maskmem, m12 +%endif + FILTER 8, h + jmp .end + +.no_flat: + test [maskq+0], mask_bitsd ; vmask[0] + XCHG_PIC_REG 1 + je .no_filter + +%if ARCH_X86_32 + mov [esp+38*16], mask_bitsd + mova maskmem, m12 +%endif + FILTER 4, h + jmp .end + +.no_filter: + lea dstq, [dstq+strideq*8] + lea dstq, [dstq+strideq*8] +%if ARCH_X86_32 + jmp .end_noload +.end: + mova m12, maskmem + mov l_strideq, l_stridem + mov mask_bitsd, [esp+38*16] +.end_noload: +%else +.end: +%endif + lea lq, [lq+l_strideq*4] + pslld m12, 4 + shl mask_bitsd, 4 +%if ARCH_X86_64 + sub hd, 4 +%else + sub dword hm, 4 +%endif + XCHG_PIC_REG 0 + jg .loop + RET + +INIT_XMM ssse3 +%if ARCH_X86_64 +cglobal lpf_v_sb_uv_8bpc, 7, 11, 16, 3 * 16, \ + dst, stride, mask, l, l_stride, lut, \ + w, stride3, mstride, tmp, mask_bits +%else +cglobal lpf_v_sb_uv_8bpc, 6, 7, 8, -16 * (12 + 
copy_args), \
+ dst, stride, mask, l, l_stride, lut, mask_bits
+ RELOC_ARGS w
+ SETUP_PIC
+ %define m12 m4
+%endif
+ shl l_strideq, 2
+ sub lq, l_strideq
+%if ARCH_X86_64
+ mov mstrideq, strideq
+ neg mstrideq
+ lea stride3q, [strideq*3]
+%else
+ mov l_stridem, l_strided
+%endif
+ mov mask_bitsd, 0xf
+ mova m12, [PIC_sym(pd_mask)]
+ XCHG_PIC_REG 0
+ movq m0, [maskq]
+ pxor m3, m3
+ movd m2, [lutq+136]
+ pshufb m2, m3
+ pshufd m1, m0, q1111
+ pshufd m0, m0, q0000
+ por m0, m1
+ mova [rsp+0*16], m0
+ mova [rsp+1*16], m1
+ mova [rsp+2*16], m2
+
+%define maskmem [esp+7*16]
+%define mask0 [rsp+0*16]
+%define mask1 [rsp+1*16]
+%define minlvl [rsp+2*16]
+
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ je .no_flat
+
+%if ARCH_X86_32
+ XCHG_PIC_REG 1
+ mov [esp+11*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 6, v
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ XCHG_PIC_REG 1
+ je .no_filter
+
+%if ARCH_X86_32
+ mov [esp+11*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 4, v
+
+.end:
+%if ARCH_X86_32
+ mova m12, maskmem
+ mov mask_bitsd, [esp+11*16]
+%endif
+.no_filter:
+ pslld m12, 4
+ shl mask_bitsd, 4
+ add lq, 16
+ add dstq, 16
+%if ARCH_X86_64
+ sub wd, 4
+%else
+ sub dword wm, 4
+%endif
+ XCHG_PIC_REG 0
+ jg .loop
+ RET
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal lpf_h_sb_uv_8bpc, 7, 11, 16, 16 * 3, \
+ dst, stride, mask, l, l_stride, lut, \
+ h, stride3, l_stride3, tmp, mask_bits
+%else
+cglobal lpf_h_sb_uv_8bpc, 6, 7, 8, -16 * (13 + copy_args), \
+ dst, stride, mask, l, l_stride, lut, mask_bits
+ RELOC_ARGS h
+ SETUP_PIC
+ %define m12 m4
+%endif
+ sub lq, 4
+ shl l_strideq, 2
+%if ARCH_X86_64
+ lea stride3q, [strideq*3]
+ lea l_stride3q, [l_strideq*3]
+%else
+ mov l_stridem, l_strided
+%endif
+ mov mask_bitsd, 0xf
+ mova m12, [PIC_sym(pd_mask)]
+ XCHG_PIC_REG 0
+ movq m0, [maskq]
+ pxor m3, m3
+ movd m2, [lutq+136]
+ pshufb m2, m3
+ pshufd m1, m0, q1111
+ pshufd m0, m0, q0000
+ por m0, m1
+ mova [rsp+0*16], m0
+ mova [rsp+1*16], m1
+ mova [rsp+2*16], m2
+
+%define maskmem [esp+7*16]
+%define mask0 [rsp+0*16]
+%define mask1 [rsp+1*16]
+%define minlvl [rsp+2*16]
+
+.loop:
+ test [maskq+4], mask_bitsd ; vmask[1]
+ je .no_flat
+
+%if ARCH_X86_32
+ XCHG_PIC_REG 1
+ mov [esp+12*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 6, h
+ jmp .end
+
+.no_flat:
+ test [maskq+0], mask_bitsd ; vmask[0]
+ XCHG_PIC_REG 1
+ je .no_filter
+
+%if ARCH_X86_32
+ mov [esp+12*16], mask_bitsd
+ mova maskmem, m12
+%endif
+ FILTER 4, h
+ jmp .end
+
+.no_filter:
+ lea dstq, [dstq+strideq*8]
+ lea dstq, [dstq+strideq*8]
+%if ARCH_X86_32
+ jmp .end_noload
+.end:
+ mova m12, maskmem
+ mov l_strided, l_stridem
+ mov mask_bitsd, [esp+12*16]
+.end_noload:
+%else
+.end:
+%endif
+ lea lq, [lq+l_strideq*4]
+ pslld m12, 4
+ shl mask_bitsd, 4
+%if ARCH_X86_64
+ sub hd, 4
+%else
+ sub dword hm, 4
+%endif
+ XCHG_PIC_REG 0
+ jg .loop
+ RET
diff --git a/third_party/dav1d/src/x86/looprestoration.h b/third_party/dav1d/src/x86/looprestoration.h
new file mode 100644
index 0000000000..de23be8866
--- /dev/null
+++ b/third_party/dav1d/src/x86/looprestoration.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/looprestoration.h" + +#include "common/intops.h" + +#define decl_wiener_filter_fns(ext) \ +decl_lr_filter_fn(BF(dav1d_wiener_filter7, ext)); \ +decl_lr_filter_fn(BF(dav1d_wiener_filter5, ext)) + +#define decl_sgr_filter_fns(ext) \ +decl_lr_filter_fn(BF(dav1d_sgr_filter_5x5, ext)); \ +decl_lr_filter_fn(BF(dav1d_sgr_filter_3x3, ext)); \ +decl_lr_filter_fn(BF(dav1d_sgr_filter_mix, ext)) + +decl_wiener_filter_fns(sse2); +decl_wiener_filter_fns(ssse3); +decl_wiener_filter_fns(avx2); +decl_wiener_filter_fns(avx512icl); +decl_sgr_filter_fns(ssse3); +decl_sgr_filter_fns(avx2); +decl_sgr_filter_fns(avx512icl); + +static ALWAYS_INLINE void loop_restoration_dsp_init_x86(Dav1dLoopRestorationDSPContext *const c, const int bpc) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return; +#if BITDEPTH == 8 + c->wiener[0] = BF(dav1d_wiener_filter7, sse2); + c->wiener[1] = BF(dav1d_wiener_filter5, sse2); +#endif + + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + c->wiener[0] = BF(dav1d_wiener_filter7, ssse3); + c->wiener[1] = BF(dav1d_wiener_filter5, ssse3); + if (BITDEPTH == 8 || bpc == 10) { + c->sgr[0] = BF(dav1d_sgr_filter_5x5, ssse3); + c->sgr[1] = BF(dav1d_sgr_filter_3x3, ssse3); + c->sgr[2] = BF(dav1d_sgr_filter_mix, ssse3); + } + +#if ARCH_X86_64 + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; + + c->wiener[0] = BF(dav1d_wiener_filter7, avx2); + c->wiener[1] = BF(dav1d_wiener_filter5, avx2); + if (BITDEPTH == 8 || bpc == 10) { + c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx2); + c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx2); + c->sgr[2] = BF(dav1d_sgr_filter_mix, avx2); + } + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + + c->wiener[0] = BF(dav1d_wiener_filter7, avx512icl); +#if BITDEPTH == 8 + /* With VNNI we don't need a 5-tap version. */ + c->wiener[1] = c->wiener[0]; +#else + c->wiener[1] = BF(dav1d_wiener_filter5, avx512icl); +#endif + if (BITDEPTH == 8 || bpc == 10) { + c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx512icl); + c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx512icl); + c->sgr[2] = BF(dav1d_sgr_filter_mix, avx512icl); + } +#endif +} diff --git a/third_party/dav1d/src/x86/looprestoration16_avx2.asm b/third_party/dav1d/src/x86/looprestoration16_avx2.asm new file mode 100644 index 0000000000..ef25c28474 --- /dev/null +++ b/third_party/dav1d/src/x86/looprestoration16_avx2.asm @@ -0,0 +1,2540 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 + +sgr_lshuf3: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 +sgr_lshuf5: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 +wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11 +wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11 +wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15 +wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 +wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +wiener_lshuf5: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +wiener_lshuf7: db 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10, 11, 12, 13, 14, 15 +pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + +wiener_hshift: dw 4, 4, 1, 1 +wiener_vshift: dw 1024, 1024, 4096, 4096 +wiener_round: dd 1049600, 1048832 + +pb_m10_m9: times 2 db -10, -9 +pb_m6_m5: times 2 db -6, -5 +pb_m2_m1: times 2 db -2, -1 +pb_2_3: times 2 db 2, 3 +pb_6_7: times 2 db 6, 7 +pw_1023: times 2 dw 1023 +pd_8: dd 8 +pd_25: dd 25 +pd_4096: dd 4096 +pd_34816: dd 34816 +pd_m262128: dd -262128 +pd_0xf00800a4: dd 0xf00800a4 +pd_0xf00801c7: dd 0xf00801c7 + +%define pw_256 sgr_lshuf5 + +cextern sgr_x_by_x_avx2 + +SECTION .text + +DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers + +INIT_YMM avx2 +cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \ + w, h, edge, flt +%define base t4-wiener_hshift + mov fltq, r6mp + movifnidn wd, wm + movifnidn hd, hm + mov edged, r7m + mov t3d, r8m ; pixel_max + vbroadcasti128 m6, [wiener_shufA] + vpbroadcastd m12, [fltq+ 0] ; x0 x1 + lea t4, [wiener_hshift] + vbroadcasti128 m7, [wiener_shufB] + add wd, wd + vpbroadcastd m13, [fltq+ 4] ; x2 x3 + shr t3d, 11 + vpbroadcastd m14, [fltq+16] ; y0 y1 + add lpfq, wq + vpbroadcastd m15, [fltq+20] ; y2 y3 + add dstq, wq + vbroadcasti128 m8, [wiener_shufC] + lea t1, [rsp+wq+16] + vbroadcasti128 m9, [wiener_shufD] + neg wq + vpbroadcastd m0, [base+wiener_hshift+t3*4] + vpbroadcastd m10, [base+wiener_round+t3*4] + vpbroadcastd m11, [base+wiener_vshift+t3*4] + pmullw m12, m0 ; upshift filter 
coefs to make the + pmullw m13, m0 ; horizontal downshift constant + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t6, t1 + mov t5, t1 + add t1, 384*2 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + mov t4, t1 + add t1, 384*2 + add r10, strideq + mov [rsp], r10 ; below + call .h + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v3 +.main: + lea t0, [t1+384*2] +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v3 + mov lpfq, [rsp] + call .hv_bottom + add lpfq, strideq + call .hv_bottom +.v1: + call .v + RET +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + mov t6, t1 + mov t5, t1 + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v3 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v3 + add t0, 384*8 + call .hv + dec hd + jnz .main +.v3: + call .v +.v2: + call .v + jmp .v1 +.extend_right: + movd xm1, r10d + vpbroadcastd m0, [pb_6_7] + movu m2, [pb_0to31] + vpbroadcastb m1, xm1 + psubb m0, m1 + pminub m0, m2 + pshufb m3, m0 + vpbroadcastd m0, [pb_m2_m1] + psubb m0, m1 + pminub m0, m2 + pshufb m4, m0 + vpbroadcastd m0, [pb_m10_m9] + psubb m0, m1 + pminub m0, m2 + pshufb m5, m0 + ret +.h: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movq xm3, [leftq] + vpblendd m3, [lpfq+r10-8], 0xfc + add leftq, 8 + jmp .h_main +.h_extend_left: + vbroadcasti128 m3, [lpfq+r10] ; avoid accessing memory located + mova m4, [lpfq+r10] ; before the start of the buffer + shufpd m3, m4, 0x05 + pshufb m3, [wiener_lshuf7] + jmp .h_main2 +.h_top: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m3, [lpfq+r10-8] +.h_main: + mova m4, [lpfq+r10+0] +.h_main2: + movu m5, [lpfq+r10+8] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -36 + jl .h_have_right + call .extend_right +.h_have_right: + pshufb m0, m3, m6 + pshufb m1, m4, m7 + paddw m0, m1 + pshufb m3, m8 + pmaddwd m0, m12 + pshufb m1, m4, m9 + paddw m3, m1 + pshufb m1, m4, m6 + pmaddwd m3, m13 + pshufb m2, m5, m7 + paddw m1, m2 + vpbroadcastd m2, [pd_m262128] ; (1 << 4) - (1 << 18) + pshufb m4, m8 + pmaddwd m1, m12 + pshufb m5, m9 + paddw m4, m5 + pmaddwd m4, m13 + paddd m0, m2 + paddd m1, m2 + paddd m0, m3 + paddd m1, m4 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + psraw m0, 1 + mova [t1+r10], m0 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv: + add lpfq, strideq + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movq xm3, [leftq] + vpblendd m3, [lpfq+r10-8], 0xfc + add leftq, 8 + jmp .hv_main +.hv_extend_left: + movu m3, [lpfq+r10-8] + pshufb m3, [wiener_lshuf7] + jmp .hv_main +.hv_bottom: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m3, [lpfq+r10-8] +.hv_main: + mova m4, [lpfq+r10+0] + movu m5, [lpfq+r10+8] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -36 + jl .hv_have_right + call .extend_right +.hv_have_right: + pshufb m0, m3, m6 + pshufb m1, m4, m7 + paddw m0, m1 + pshufb m3, m8 + pmaddwd m0, m12 + pshufb m1, m4, m9 + paddw m3, m1 + pshufb m1, m4, m6 + pmaddwd m3, m13 + pshufb m2, m5, m7 + paddw m1, m2 + vpbroadcastd m2, [pd_m262128] + pshufb m4, m8 + pmaddwd m1, m12 + pshufb m5, m9 + paddw 
m4, m5 + pmaddwd m4, m13 + paddd m0, m2 + paddd m1, m2 + mova m2, [t4+r10] + paddw m2, [t2+r10] + mova m5, [t3+r10] + paddd m0, m3 + paddd m1, m4 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + mova m4, [t5+r10] + paddw m4, [t1+r10] + psraw m0, 1 + paddw m3, m0, [t6+r10] + mova [t0+r10], m0 + punpcklwd m0, m2, m5 + pmaddwd m0, m15 + punpckhwd m2, m5 + pmaddwd m2, m15 + punpcklwd m1, m3, m4 + pmaddwd m1, m14 + punpckhwd m3, m4 + pmaddwd m3, m14 + paddd m0, m10 + paddd m2, m10 + paddd m0, m1 + paddd m2, m3 + psrad m0, 5 + psrad m2, 5 + packusdw m0, m2 + pmulhuw m0, m11 + mova [dstq+r10], m0 + add r10, 32 + jl .hv_loop + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t6 + add dstq, strideq + ret +.v: + mov r10, wq +.v_loop: + mova m1, [t4+r10] + paddw m1, [t2+r10] + mova m2, [t3+r10] + mova m4, [t1+r10] + paddw m3, m4, [t6+r10] + paddw m4, [t5+r10] + punpcklwd m0, m1, m2 + pmaddwd m0, m15 + punpckhwd m1, m2 + pmaddwd m1, m15 + punpcklwd m2, m3, m4 + pmaddwd m2, m14 + punpckhwd m3, m4 + pmaddwd m3, m14 + paddd m0, m10 + paddd m1, m10 + paddd m0, m2 + paddd m1, m3 + psrad m0, 5 + psrad m1, 5 + packusdw m0, m1 + pmulhuw m0, m11 + mova [dstq+r10], m0 + add r10, 32 + jl .v_loop + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + add dstq, strideq + ret + +cglobal wiener_filter5_16bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \ + w, h, edge, flt +%define base t4-wiener_hshift + mov fltq, r6mp + movifnidn wd, wm + movifnidn hd, hm + mov edged, r7m + mov t3d, r8m ; pixel_max + vbroadcasti128 m5, [wiener_shufE] + vpbroadcastw m11, [fltq+ 2] ; x1 + vbroadcasti128 m6, [wiener_shufB] + lea t4, [wiener_hshift] + vbroadcasti128 m7, [wiener_shufD] + add wd, wd + vpbroadcastd m12, [fltq+ 4] ; x2 x3 + shr t3d, 11 + vpbroadcastd m8, [pd_m262128] ; (1 << 4) - (1 << 18) + add lpfq, wq + vpbroadcastw m13, [fltq+18] ; y1 + add dstq, wq + vpbroadcastd m14, [fltq+20] ; y2 y3 + lea t1, [rsp+wq+16] + neg wq + vpbroadcastd m0, [base+wiener_hshift+t3*4] + vpbroadcastd m9, [base+wiener_round+t3*4] + vpbroadcastd m10, [base+wiener_vshift+t3*4] + movu xm15, [wiener_lshuf5] + pmullw m11, m0 + vinserti128 m15, [pb_0to31], 1 + pmullw m12, m0 + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t4, t1 + add t1, 384*2 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + mov t3, t1 + add t1, 384*2 + add r10, strideq + mov [rsp], r10 ; below + call .h + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v2 +.main: + mov t0, t4 +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v2 + mov lpfq, [rsp] + call .hv_bottom + add lpfq, strideq + call .hv_bottom +.end: + RET +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v2 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v2 + add t0, 384*6 + call .hv + dec hd + jnz .main +.v2: + call .v + mov t4, t3 + mov t3, t2 + mov t2, t1 + add dstq, strideq +.v1: + call .v + jmp .end +.extend_right: + movd xm2, r10d + vpbroadcastd m0, [pb_2_3] + vpbroadcastd m1, [pb_m6_m5] + vpbroadcastb m2, xm2 + psubb m0, m2 + psubb m1, m2 + movu m2, [pb_0to31] + pminub m0, m2 + pminub m1, m2 + pshufb m3, m0 + pshufb m4, m1 + ret +.h: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movd xm3, [leftq+4] + vpblendd m3, [lpfq+r10-4], 0xfe + add leftq, 8 + 
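+; m3 = { left[2], left[3], px[0..13] }: dword 0 keeps the two pixels of
+; left-edge context from the left buffer, dwords 1-7 load the row itself
+; starting two pixels early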
jmp .h_main +.h_extend_left: + vbroadcasti128 m4, [lpfq+r10] ; avoid accessing memory located + mova m3, [lpfq+r10] ; before the start of the buffer + palignr m3, m4, 12 + pshufb m3, m15 + jmp .h_main +.h_top: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m3, [lpfq+r10-4] +.h_main: + movu m4, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -34 + jl .h_have_right + call .extend_right +.h_have_right: + pshufb m0, m3, m5 + pmaddwd m0, m11 + pshufb m1, m4, m5 + pmaddwd m1, m11 + pshufb m2, m3, m6 + pshufb m3, m7 + paddw m2, m3 + pshufb m3, m4, m6 + pmaddwd m2, m12 + pshufb m4, m7 + paddw m3, m4 + pmaddwd m3, m12 + paddd m0, m8 + paddd m1, m8 + paddd m0, m2 + paddd m1, m3 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + psraw m0, 1 + mova [t1+r10], m0 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv: + add lpfq, strideq + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movd xm3, [leftq+4] + vpblendd m3, [lpfq+r10-4], 0xfe + add leftq, 8 + jmp .hv_main +.hv_extend_left: + movu m3, [lpfq+r10-4] + pshufb m3, m15 + jmp .hv_main +.hv_bottom: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m3, [lpfq+r10-4] +.hv_main: + movu m4, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -34 + jl .hv_have_right + call .extend_right +.hv_have_right: + pshufb m0, m3, m5 + pmaddwd m0, m11 + pshufb m1, m4, m5 + pmaddwd m1, m11 + pshufb m2, m3, m6 + pshufb m3, m7 + paddw m2, m3 + pshufb m3, m4, m6 + pmaddwd m2, m12 + pshufb m4, m7 + paddw m3, m4 + pmaddwd m3, m12 + paddd m0, m8 + paddd m1, m8 + paddd m0, m2 + mova m2, [t3+r10] + paddw m2, [t1+r10] + paddd m1, m3 + mova m4, [t2+r10] + punpckhwd m3, m2, m4 + pmaddwd m3, m14 + punpcklwd m2, m4 + mova m4, [t4+r10] + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + pmaddwd m2, m14 + psraw m0, 1 + mova [t0+r10], m0 + punpckhwd m1, m0, m4 + pmaddwd m1, m13 + punpcklwd m0, m4 + pmaddwd m0, m13 + paddd m3, m9 + paddd m2, m9 + paddd m1, m3 + paddd m0, m2 + psrad m1, 5 + psrad m0, 5 + packusdw m0, m1 + pmulhuw m0, m10 + mova [dstq+r10], m0 + add r10, 32 + jl .hv_loop + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t4 + add dstq, strideq + ret +.v: + mov r10, wq +.v_loop: + mova m0, [t1+r10] + paddw m2, m0, [t3+r10] + mova m1, [t2+r10] + mova m4, [t4+r10] + punpckhwd m3, m2, m1 + pmaddwd m3, m14 + punpcklwd m2, m1 + pmaddwd m2, m14 + punpckhwd m1, m0, m4 + pmaddwd m1, m13 + punpcklwd m0, m4 + pmaddwd m0, m13 + paddd m3, m9 + paddd m2, m9 + paddd m1, m3 + paddd m0, m2 + psrad m1, 5 + psrad m0, 5 + packusdw m0, m1 + pmulhuw m0, m10 + mova [dstq+r10], m0 + add r10, 32 + jl .v_loop + ret + +cglobal sgr_filter_5x5_16bpc, 4, 14, 15, 400*24+16, dst, stride, left, lpf, \ + w, h, edge, params + movifnidn wd, wm + mov paramsq, r6mp + lea r13, [sgr_x_by_x_avx2+256*4] + movifnidn hd, hm + mov edged, r7m + add wd, wd + vpbroadcastw m7, [paramsq+8] ; w0 + add lpfq, wq + vpbroadcastd m8, [pd_8] + add dstq, wq + vpbroadcastd m9, [pd_25] + lea t3, [rsp+wq*2+400*12+16] + vpbroadcastd m10, [paramsq+0] ; s0 + lea t4, [rsp+wq+400*20+16] + vpbroadcastd m11, [pd_0xf00800a4] + lea t1, [rsp+wq+20] + mova xm12, [sgr_lshuf5] + neg wq + vpbroadcastd m13, [pd_34816] ; (1 << 11) + (1 << 15) + pxor m6, m6 + vpbroadcastd m14, [pw_1023] + psllw m7, 4 + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t2, t1 + call .top_fixup + add t1, 400*6 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, 
strideq + mov [rsp], r10 ; below + mov t0, t2 + dec hd + jz .height1 + or edged, 16 + call .h +.main: + add lpfq, strideq + call .hv + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, strideq + test hd, hd + jz .odd_height + call .h + add lpfq, strideq + call .hv + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .h_top + add lpfq, strideq + call .hv_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .hv + call .prep_n + jmp .odd_height_end +.odd_height: + call .hv + call .n0 + call .n1 +.odd_height_end: + call .v + call .n0 + jmp .end2 +.extend_bottom: + call .v + jmp .end +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + lea t2, [t1+400*6] + call .top_fixup + dec hd + jz .no_top_height1 + or edged, 16 + mov t0, t1 + mov t1, t2 + jmp .main +.no_top_height1: + call .v + call .prep_n + jmp .odd_height_end +.extend_right: + vpbroadcastw m0, [lpfq-2] + movu m1, [r13+r10+ 0] + movu m2, [r13+r10+16] + vpblendvb m4, m0, m1 + vpblendvb m5, m0, m2 + ret +.h: ; horizontal boxsum + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 10 + jmp .h_main +.h_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, xm12 + vinserti128 m4, [lpfq+wq+10], 1 + jmp .h_main +.h_top: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m4, [lpfq+r10- 2] +.h_main: + movu m5, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -36 + jl .h_have_right + call .extend_right +.h_have_right: + palignr m2, m5, m4, 2 + paddw m0, m4, m2 + palignr m3, m5, m4, 6 + paddw m0, m3 + punpcklwd m1, m2, m3 + pmaddwd m1, m1 + punpckhwd m2, m3 + pmaddwd m2, m2 + shufpd m5, m4, m5, 0x05 + paddw m0, m5 + punpcklwd m3, m4, m5 + pmaddwd m3, m3 + paddd m1, m3 + punpckhwd m3, m4, m5 + pmaddwd m3, m3 + shufps m4, m5, q2121 + paddw m0, m4 ; sum + punpcklwd m5, m4, m6 + pmaddwd m5, m5 + punpckhwd m4, m6 + pmaddwd m4, m4 + paddd m2, m3 + test edgeb, 16 ; y > 0 + jz .h_loop_end + paddw m0, [t1+r10+400*0] + paddd m1, [t1+r10+400*2] + paddd m2, [t1+r10+400*4] +.h_loop_end: + paddd m1, m5 ; sumsq + paddd m2, m4 + mova [t1+r10+400*0], m0 + mova [t1+r10+400*2], m1 + mova [t1+r10+400*4], m2 + add r10, 32 + jl .h_loop + ret +.top_fixup: + lea r10, [wq-4] +.top_fixup_loop: ; the sums of the first row needs to be doubled + mova m0, [t1+r10+400*0] + mova m1, [t1+r10+400*2] + mova m2, [t1+r10+400*4] + paddw m0, m0 + paddd m1, m1 + paddd m2, m2 + mova [t2+r10+400*0], m0 + mova [t2+r10+400*2], m1 + mova [t2+r10+400*4], m2 + add r10, 32 + jl .top_fixup_loop + ret +ALIGN function_align +.hv: ; horizontal boxsum + vertical boxsum + ab + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 10 + jmp .hv_main +.hv_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, xm12 + vinserti128 m4, [lpfq+wq+10], 1 + jmp .hv_main +.hv_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m4, [lpfq+r10- 2] +.hv_main: + movu m5, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -36 + jl .hv_have_right + call .extend_right +.hv_have_right: + palignr m3, m5, m4, 2 + paddw m0, m4, m3 + palignr m1, m5, m4, 6 + paddw m0, m1 + punpcklwd m2, 
m3, m1 + pmaddwd m2, m2 + punpckhwd m3, m1 + pmaddwd m3, m3 + shufpd m5, m4, m5, 0x05 + paddw m0, m5 + punpcklwd m1, m4, m5 + pmaddwd m1, m1 + paddd m2, m1 + punpckhwd m1, m4, m5 + pmaddwd m1, m1 + shufps m4, m5, q2121 + paddw m0, m4 ; h sum + punpcklwd m5, m4, m6 + pmaddwd m5, m5 + punpckhwd m4, m6 + pmaddwd m4, m4 + paddd m3, m1 + paddd m2, m5 ; h sumsq + paddd m3, m4 + paddw m1, m0, [t1+r10+400*0] + paddd m4, m2, [t1+r10+400*2] + paddd m5, m3, [t1+r10+400*4] + test hd, hd + jz .hv_last_row +.hv_main2: + paddw m1, [t2+r10+400*0] ; hv sum + paddd m4, [t2+r10+400*2] ; hv sumsq + paddd m5, [t2+r10+400*4] + mova [t0+r10+400*0], m0 + mova [t0+r10+400*2], m2 + mova [t0+r10+400*4], m3 + psrlw m3, m1, 1 + paddd m4, m8 + pavgw m3, m6 ; (b + 2) >> 2 + paddd m5, m8 + psrld m4, 4 ; (a + 8) >> 4 + punpcklwd m2, m3, m6 + psrld m5, 4 + punpckhwd m3, m6 + pmulld m4, m9 ; a * 25 + pmulld m5, m9 + pmaddwd m2, m2 ; b * b + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmaxud m4, m2 + pmaxud m5, m3 + psubd m4, m2 ; p + psubd m5, m3 + pmulld m4, m10 ; p * s + pmulld m5, m10 + pmaddwd m0, m11 ; b * 164 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 ; x + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m13 + mova [t4+r10+4], m2 + psrld m0, 12 ; b + psrld m1, 12 + mova [t3+r10*2+ 8], xm0 + vextracti128 [t3+r10*2+40], m0, 1 + mova [t3+r10*2+24], xm1 + vextracti128 [t3+r10*2+56], m1, 1 + add r10, 32 + jl .hv_loop + mov t2, t1 + mov t1, t0 + mov t0, t2 + ret +.hv_last_row: ; esoteric edge case for odd heights + mova [t1+r10+400*0], m1 + paddw m1, m0 + mova [t1+r10+400*2], m4 + paddd m4, m2 + mova [t1+r10+400*4], m5 + paddd m5, m3 + jmp .hv_main2 +.v: ; vertical boxsum + ab + lea r10, [wq-4] +.v_loop: + mova m0, [t1+r10+400*0] + mova m2, [t1+r10+400*2] + mova m3, [t1+r10+400*4] + paddw m1, m0, [t2+r10+400*0] + paddd m4, m2, [t2+r10+400*2] + paddd m5, m3, [t2+r10+400*4] + paddw m0, m0 + paddd m2, m2 + paddd m3, m3 + paddw m1, m0 ; hv sum + paddd m4, m2 ; hv sumsq + paddd m5, m3 + psrlw m3, m1, 1 + paddd m4, m8 + pavgw m3, m6 ; (b + 2) >> 2 + paddd m5, m8 + psrld m4, 4 ; (a + 8) >> 4 + punpcklwd m2, m3, m6 + psrld m5, 4 + punpckhwd m3, m6 + pmulld m4, m9 ; a * 25 + pmulld m5, m9 + pmaddwd m2, m2 ; b * b + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmaxud m4, m2 + pmaxud m5, m3 + psubd m4, m2 ; p + psubd m5, m3 + pmulld m4, m10 ; p * s + pmulld m5, m10 + pmaddwd m0, m11 ; b * 164 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 ; x + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m13 + mova [t4+r10+4], m2 + psrld m0, 12 ; b + psrld m1, 12 + mova [t3+r10*2+ 8], xm0 + vextracti128 [t3+r10*2+40], m0, 1 + mova [t3+r10*2+24], xm1 + vextracti128 [t3+r10*2+56], m1, 1 + add r10, 32 + jl .v_loop + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + movu m0, [t4+r10*1+ 2] + movu m1, [t3+r10*2+ 4] + movu m2, [t3+r10*2+36] + paddw m3, m0, [t4+r10*1+ 0] + paddd m4, m1, [t3+r10*2+ 0] + paddd m5, m2, [t3+r10*2+32] + paddw m3, [t4+r10*1+ 4] + paddd m4, [t3+r10*2+ 8] + paddd m5, [t3+r10*2+40] + paddw m0, m3 + psllw m3, 2 + paddd m1, m4 + pslld m4, 2 + paddd m2, m5 + pslld m5, 2 + paddw m0, m3 ; a 
565 + paddd m1, m4 ; b 565 + paddd m2, m5 + mova [t4+r10*1+400*2+ 0], m0 + mova [t3+r10*2+400*4+ 0], m1 + mova [t3+r10*2+400*4+32], m2 + add r10, 32 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + movu m0, [t4+r10*1+ 2] + movu m1, [t3+r10*2+ 4] + movu m2, [t3+r10*2+36] + paddw m3, m0, [t4+r10*1+ 0] + paddd m4, m1, [t3+r10*2+ 0] + paddd m5, m2, [t3+r10*2+32] + paddw m3, [t4+r10*1+ 4] + paddd m4, [t3+r10*2+ 8] + paddd m5, [t3+r10*2+40] + paddw m0, m3 + psllw m3, 2 + paddd m1, m4 + pslld m4, 2 + paddd m2, m5 + pslld m5, 2 + paddw m0, m3 ; a 565 + paddd m1, m4 ; b 565 + paddd m2, m5 + paddw m3, m0, [t4+r10*1+400*2+ 0] + paddd m4, m1, [t3+r10*2+400*4+ 0] + paddd m5, m2, [t3+r10*2+400*4+32] + mova [t4+r10*1+400*2+ 0], m0 + mova [t3+r10*2+400*4+ 0], m1 + mova [t3+r10*2+400*4+32], m2 + mova m0, [dstq+r10] + punpcklwd m1, m0, m6 ; src + punpcklwd m2, m3, m6 ; a + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + vinserti128 m1, m4, xm5, 1 + vperm2i128 m4, m5, 0x31 + psubd m1, m2 ; b - a * src + (1 << 8) + psubd m4, m3 + psrad m1, 9 + psrad m4, 9 + packssdw m1, m4 + pmulhrsw m1, m7 + paddw m0, m1 + pmaxsw m0, m6 + pminsw m0, m14 + mova [dstq+r10], m0 + add r10, 32 + jl .n0_loop + add dstq, strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + mova m0, [dstq+r10] + mova m3, [t4+r10*1+400*2+ 0] + mova m4, [t3+r10*2+400*4+ 0] + mova m5, [t3+r10*2+400*4+32] + punpcklwd m1, m0, m6 ; src + punpcklwd m2, m3, m6 ; a + pmaddwd m2, m1 + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + vinserti128 m1, m4, xm5, 1 + vperm2i128 m4, m5, 0x31 + psubd m1, m2 ; b - a * src + (1 << 7) + psubd m4, m3 + psrad m1, 8 + psrad m4, 8 + packssdw m1, m4 + pmulhrsw m1, m7 + paddw m0, m1 + pmaxsw m0, m6 + pminsw m0, m14 + mova [dstq+r10], m0 + add r10, 32 + jl .n1_loop + add dstq, strideq + ret + +cglobal sgr_filter_3x3_16bpc, 4, 14, 14, 400*42+8, dst, stride, left, lpf, \ + w, h, edge, params + movifnidn wd, wm + mov paramsq, r6mp + lea r13, [sgr_x_by_x_avx2+256*4] + add wd, wd + movifnidn hd, hm + mov edged, r7m + add lpfq, wq + vpbroadcastw m7, [paramsq+10] ; w1 + add dstq, wq + vpbroadcastd m9, [paramsq+ 4] ; s1 + lea t3, [rsp+wq*2+400*12+8] + vpbroadcastd m8, [pd_8] + lea t4, [rsp+wq+400*32+8] + vpbroadcastd m10, [pd_0xf00801c7] + lea t1, [rsp+wq+12] + vpbroadcastd m11, [pd_34816] + neg wq + mova xm12, [sgr_lshuf3] + pxor m6, m6 + vpbroadcastd m13, [pw_1023] + psllw m7, 4 + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t2, t1 + add t1, 400*6 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, strideq + mov [rsp], r10 ; below + call .hv0 +.main: + dec hd + jz .height1 + add lpfq, strideq + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, strideq + call .hv0 + test hd, hd + jz .odd_height + add lpfq, strideq + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .hv0_bottom + add lpfq, strideq + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + lea r10, [wq-4] + lea t2, 
[t1+400*6] +.top_fixup_loop: + mova m0, [t1+r10+400*0] + mova m1, [t1+r10+400*2] + mova m2, [t1+r10+400*4] + mova [t2+r10+400*0], m0 + mova [t2+r10+400*2], m1 + mova [t2+r10+400*4], m2 + add r10, 32 + jl .top_fixup_loop + call .v0 + jmp .main +.extend_right: + vpbroadcastw m0, [lpfq-2] + movu m1, [r13+r10+ 2] + movu m2, [r13+r10+18] + vpblendvb m4, m0, m1 + vpblendvb m5, m0, m2 + ret +.h: ; horizontal boxsum + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 12 + jmp .h_main +.h_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, xm12 + vinserti128 m4, [lpfq+wq+12], 1 + jmp .h_main +.h_top: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m4, [lpfq+r10+ 0] +.h_main: + movu m5, [lpfq+r10+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -34 + jl .h_have_right + call .extend_right +.h_have_right: + palignr m0, m5, m4, 2 + paddw m1, m4, m0 + punpcklwd m2, m4, m0 + pmaddwd m2, m2 + punpckhwd m3, m4, m0 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m1, m5 ; sum + punpcklwd m4, m5, m6 + pmaddwd m4, m4 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m4 ; sumsq + paddd m3, m5 + mova [t1+r10+400*0], m1 + mova [t1+r10+400*2], m2 + mova [t1+r10+400*4], m3 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows) + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 12 + jmp .hv0_main +.hv0_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, xm12 + vinserti128 m4, [lpfq+wq+12], 1 + jmp .hv0_main +.hv0_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left +.hv0_loop: + movu m4, [lpfq+r10+ 0] +.hv0_main: + movu m5, [lpfq+r10+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp r10d, -34 + jl .hv0_have_right + call .extend_right +.hv0_have_right: + palignr m0, m5, m4, 2 + paddw m1, m4, m0 + punpcklwd m2, m4, m0 + pmaddwd m2, m2 + punpckhwd m3, m4, m0 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m1, m5 ; sum + punpcklwd m4, m5, m6 + pmaddwd m4, m4 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m4 ; sumsq + paddd m3, m5 + paddw m0, m1, [t1+r10+400*0] + paddd m4, m2, [t1+r10+400*2] + paddd m5, m3, [t1+r10+400*4] + mova [t1+r10+400*0], m1 + mova [t1+r10+400*2], m2 + mova [t1+r10+400*4], m3 + paddw m1, m0, [t2+r10+400*0] + paddd m2, m4, [t2+r10+400*2] + paddd m3, m5, [t2+r10+400*4] + mova [t2+r10+400*0], m0 + mova [t2+r10+400*2], m4 + mova [t2+r10+400*4], m5 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmaxud m4, m2 + psubd m4, m2 ; p + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m9 ; p * s + pmulld m5, m9 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 ; x + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + psrld m0, 12 + psrld m1, 12 + mova [t4+r10*1+400*0+ 4], m2 + mova [t3+r10*2+400*0+ 8], xm0 + 
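+; m0/m1 hold the low/high four words of each lane widened to dwords, so
+; their 128-bit halves are stored interleaved to restore sequential order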
vextracti128 [t3+r10*2+400*0+40], m0, 1 + mova [t3+r10*2+400*0+24], xm1 + vextracti128 [t3+r10*2+400*0+56], m1, 1 + add r10, 32 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 12 + jmp .hv1_main +.hv1_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, xm12 + vinserti128 m4, [lpfq+wq+12], 1 + jmp .hv1_main +.hv1_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left +.hv1_loop: + movu m4, [lpfq+r10+ 0] +.hv1_main: + movu m5, [lpfq+r10+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp r10d, -34 + jl .hv1_have_right + call .extend_right +.hv1_have_right: + palignr m1, m5, m4, 2 + paddw m0, m4, m1 + punpcklwd m2, m4, m1 + pmaddwd m2, m2 + punpckhwd m3, m4, m1 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m0, m5 ; h sum + punpcklwd m1, m5, m6 + pmaddwd m1, m1 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m1 ; h sumsq + paddd m3, m5 + paddw m1, m0, [t2+r10+400*0] + paddd m4, m2, [t2+r10+400*2] + paddd m5, m3, [t2+r10+400*4] + mova [t2+r10+400*0], m0 + mova [t2+r10+400*2], m2 + mova [t2+r10+400*4], m3 + paddd m4, m8 + paddd m5, m8 + psrld m4, 4 ; (a + 8) >> 4 + psrld m5, 4 + pslld m2, m4, 3 + pslld m3, m5, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmaxud m4, m2 + psubd m4, m2 ; p + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m9 ; p * s + pmulld m5, m9 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 ; x + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + psrld m0, 12 + psrld m1, 12 + mova [t4+r10*1+400*2 +4], m2 + mova [t3+r10*2+400*4+ 8], xm0 + vextracti128 [t3+r10*2+400*4+40], m0, 1 + mova [t3+r10*2+400*4+24], xm1 + vextracti128 [t3+r10*2+400*4+56], m1, 1 + add r10, 32 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab (even rows) + lea r10, [wq-4] +.v0_loop: + mova m0, [t1+r10+400*0] + mova m4, [t1+r10+400*2] + mova m5, [t1+r10+400*4] + paddw m0, m0 + paddd m4, m4 + paddd m5, m5 + paddw m1, m0, [t2+r10+400*0] + paddd m2, m4, [t2+r10+400*2] + paddd m3, m5, [t2+r10+400*4] + mova [t2+r10+400*0], m0 + mova [t2+r10+400*2], m4 + mova [t2+r10+400*4], m5 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmaxud m4, m2 + psubd m4, m2 ; p + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m9 ; p * s + pmulld m5, m9 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 ; x + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + psrld m0, 12 + psrld m1, 12 + mova [t4+r10*1+400*0+ 4], m2 + mova [t3+r10*2+400*0+ 
8], xm0 + vextracti128 [t3+r10*2+400*0+40], m0, 1 + mova [t3+r10*2+400*0+24], xm1 + vextracti128 [t3+r10*2+400*0+56], m1, 1 + add r10, 32 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) + lea r10, [wq-4] +.v1_loop: + mova m0, [t1+r10+400*0] + mova m4, [t1+r10+400*2] + mova m5, [t1+r10+400*4] + paddw m1, m0, [t2+r10+400*0] + paddd m2, m4, [t2+r10+400*2] + paddd m3, m5, [t2+r10+400*4] + mova [t2+r10+400*0], m0 + mova [t2+r10+400*2], m4 + mova [t2+r10+400*4], m5 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmaxud m4, m2 + psubd m4, m2 ; p + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m9 ; p * s + pmulld m5, m9 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 ; x + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + psrld m0, 12 + psrld m1, 12 + mova [t4+r10*1+400*2+ 4], m2 + mova [t3+r10*2+400*4+ 8], xm0 + vextracti128 [t3+r10*2+400*4+40], m0, 1 + mova [t3+r10*2+400*4+24], xm1 + vextracti128 [t3+r10*2+400*4+56], m1, 1 + add r10, 32 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + mova xm0, [t4+r10*1+400*0+0] + paddw xm0, [t4+r10*1+400*0+4] + paddw xm2, xm0, [t4+r10*1+400*0+2] + mova m1, [t3+r10*2+400*0+0] + paddd m1, [t3+r10*2+400*0+8] + paddd m3, m1, [t3+r10*2+400*0+4] + psllw xm2, 2 ; a[-1] 444 + pslld m3, 2 ; b[-1] 444 + psubw xm2, xm0 ; a[-1] 343 + psubd m3, m1 ; b[-1] 343 + mova [t4+r10*1+400* 4], xm2 + mova [t3+r10*2+400* 8], m3 + mova xm0, [t4+r10*1+400*2+0] + paddw xm0, [t4+r10*1+400*2+4] + paddw xm2, xm0, [t4+r10*1+400*2+2] + mova m1, [t3+r10*2+400*4+0] + paddd m1, [t3+r10*2+400*4+8] + paddd m3, m1, [t3+r10*2+400*4+4] + psllw xm2, 2 ; a[ 0] 444 + pslld m3, 2 ; b[ 0] 444 + mova [t4+r10*1+400* 6], xm2 + mova [t3+r10*2+400*12], m3 + psubw xm2, xm0 ; a[ 0] 343 + psubd m3, m1 ; b[ 0] 343 + mova [t4+r10*1+400* 8], xm2 + mova [t3+r10*2+400*16], m3 + add r10, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + mova m3, [t4+r10*1+400*0+0] + paddw m3, [t4+r10*1+400*0+4] + paddw m1, m3, [t4+r10*1+400*0+2] + psllw m1, 2 ; a[ 1] 444 + psubw m2, m1, m3 ; a[ 1] 343 + paddw m3, m2, [t4+r10*1+400*4] + paddw m3, [t4+r10*1+400*6] + mova [t4+r10*1+400*4], m2 + mova [t4+r10*1+400*6], m1 + mova m4, [t3+r10*2+400*0+0] + paddd m4, [t3+r10*2+400*0+8] + paddd m1, m4, [t3+r10*2+400*0+4] + pslld m1, 2 ; b[ 1] 444 + psubd m2, m1, m4 ; b[ 1] 343 + paddd m4, m2, [t3+r10*2+400* 8+ 0] + paddd m4, [t3+r10*2+400*12+ 0] + mova [t3+r10*2+400* 8+ 0], m2 + mova [t3+r10*2+400*12+ 0], m1 + mova m5, [t3+r10*2+400*0+32] + paddd m5, [t3+r10*2+400*0+40] + paddd m1, m5, [t3+r10*2+400*0+36] + pslld m1, 2 + psubd m2, m1, m5 + paddd m5, m2, [t3+r10*2+400* 8+32] + paddd m5, [t3+r10*2+400*12+32] + mova [t3+r10*2+400* 8+32], m2 + mova [t3+r10*2+400*12+32], m1 + mova m0, [dstq+r10] + punpcklwd m1, m0, m6 + punpcklwd m2, m3, m6 + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + vinserti128 m1, m4, xm5, 1 + vperm2i128 m4, m5, 0x31 + psubd m1, m2 ; b - a 
* src + (1 << 8) + psubd m4, m3 + psrad m1, 9 + psrad m4, 9 + packssdw m1, m4 + pmulhrsw m1, m7 + paddw m0, m1 + pmaxsw m0, m6 + pminsw m0, m13 + mova [dstq+r10], m0 + add r10, 32 + jl .n0_loop + add dstq, strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + mova m3, [t4+r10*1+400*2+0] + paddw m3, [t4+r10*1+400*2+4] + paddw m1, m3, [t4+r10*1+400*2+2] + psllw m1, 2 ; a[ 1] 444 + psubw m2, m1, m3 ; a[ 1] 343 + paddw m3, m2, [t4+r10*1+400*6] + paddw m3, [t4+r10*1+400*8] + mova [t4+r10*1+400*6], m1 + mova [t4+r10*1+400*8], m2 + mova m4, [t3+r10*2+400*4+0] + paddd m4, [t3+r10*2+400*4+8] + paddd m1, m4, [t3+r10*2+400*4+4] + pslld m1, 2 ; b[ 1] 444 + psubd m2, m1, m4 ; b[ 1] 343 + paddd m4, m2, [t3+r10*2+400*12+ 0] + paddd m4, [t3+r10*2+400*16+ 0] + mova [t3+r10*2+400*12+ 0], m1 + mova [t3+r10*2+400*16+ 0], m2 + mova m5, [t3+r10*2+400*4+32] + paddd m5, [t3+r10*2+400*4+40] + paddd m1, m5, [t3+r10*2+400*4+36] + pslld m1, 2 + psubd m2, m1, m5 + paddd m5, m2, [t3+r10*2+400*12+32] + paddd m5, [t3+r10*2+400*16+32] + mova [t3+r10*2+400*12+32], m1 + mova [t3+r10*2+400*16+32], m2 + mova m0, [dstq+r10] + punpcklwd m1, m0, m6 + punpcklwd m2, m3, m6 + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + vinserti128 m1, m4, xm5, 1 + vperm2i128 m4, m5, 0x31 + psubd m1, m2 ; b - a * src + (1 << 8) + psubd m4, m3 + psrad m1, 9 + psrad m4, 9 + packssdw m1, m4 + pmulhrsw m1, m7 + paddw m0, m1 + pmaxsw m0, m6 + pminsw m0, m13 + mova [dstq+r10], m0 + add r10, 32 + jl .n1_loop + add dstq, strideq + ret + +cglobal sgr_filter_mix_16bpc, 4, 14, 16, 400*66+8, dst, stride, left, lpf, \ + w, h, edge, params + movifnidn wd, wm + mov paramsq, r6mp + lea r13, [sgr_x_by_x_avx2+256*4] + add wd, wd + movifnidn hd, hm + mov edged, r7m + add lpfq, wq + vpbroadcastd m15, [paramsq+8] ; w0 w1 + add dstq, wq + vpbroadcastd m13, [paramsq+0] ; s0 + lea t3, [rsp+wq*2+400*24+8] + vpbroadcastd m14, [paramsq+4] ; s1 + lea t4, [rsp+wq+400*52+8] + vpbroadcastd m9, [pd_8] + lea t1, [rsp+wq+12] + vpbroadcastd m10, [pd_34816] + neg wq + vpbroadcastd m11, [pd_4096] + pxor m7, m7 + vpbroadcastd m12, [pd_0xf00801c7] + psllw m15, 2 + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t2, t1 + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).top_fixup + add t1, 400*12 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, strideq + mov [rsp], r10 ; below + call .hv0 +.main: + dec hd + jz .height1 + add lpfq, strideq + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, strideq + call .hv0 + test hd, hd + jz .odd_height + add lpfq, strideq + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .hv0_bottom + add lpfq, strideq + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + lea r10, [wq-4] + lea t2, [t1+400*12] +.top_fixup_loop: + mova m0, [t1+r10+400* 0] + mova m1, [t1+r10+400* 2] + mova m2, [t1+r10+400* 4] + paddw m0, m0 + mova m3, [t1+r10+400* 6] + paddd m1, m1 + mova m4, [t1+r10+400* 8] + paddd m2, m2 + mova m5, [t1+r10+400*10] + mova [t2+r10+400* 0], m0 + mova 
[t2+r10+400* 2], m1 + mova [t2+r10+400* 4], m2 + mova [t2+r10+400* 6], m3 + mova [t2+r10+400* 8], m4 + mova [t2+r10+400*10], m5 + add r10, 32 + jl .top_fixup_loop + call .v0 + jmp .main +.h: ; horizontal boxsum + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 10 + jmp .h_main +.h_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, [sgr_lshuf5] + vinserti128 m4, [lpfq+wq+10], 1 + jmp .h_main +.h_top: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m4, [lpfq+r10- 2] +.h_main: + movu m5, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -36 + jl .h_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right +.h_have_right: + palignr m3, m5, m4, 2 + palignr m0, m5, m4, 4 + paddw m1, m3, m0 + punpcklwd m2, m3, m0 + pmaddwd m2, m2 + punpckhwd m3, m0 + pmaddwd m3, m3 + palignr m0, m5, m4, 6 + paddw m1, m0 ; sum3 + punpcklwd m6, m0, m7 + pmaddwd m6, m6 + punpckhwd m0, m7 + pmaddwd m0, m0 + paddd m2, m6 ; sumsq3 + shufpd m6, m4, m5, 0x05 + punpcklwd m5, m6, m4 + paddw m8, m4, m6 + pmaddwd m5, m5 + punpckhwd m6, m4 + pmaddwd m6, m6 + paddd m3, m0 + mova [t1+r10+400* 6], m1 + mova [t1+r10+400* 8], m2 + mova [t1+r10+400*10], m3 + paddw m8, m1 ; sum5 + paddd m5, m2 ; sumsq5 + paddd m6, m3 + mova [t1+r10+400* 0], m8 + mova [t1+r10+400* 2], m5 + mova [t1+r10+400* 4], m6 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows) + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 10 + jmp .hv0_main +.hv0_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, [sgr_lshuf5] + vinserti128 m4, [lpfq+wq+10], 1 + jmp .hv0_main +.hv0_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left +.hv0_loop: + movu m4, [lpfq+r10- 2] +.hv0_main: + movu m5, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp r10d, -36 + jl .hv0_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right +.hv0_have_right: + palignr m3, m5, m4, 2 + palignr m0, m5, m4, 4 + paddw m1, m3, m0 + punpcklwd m2, m3, m0 + pmaddwd m2, m2 + punpckhwd m3, m0 + pmaddwd m3, m3 + palignr m0, m5, m4, 6 + paddw m1, m0 ; h sum3 + punpcklwd m6, m0, m7 + pmaddwd m6, m6 + punpckhwd m0, m7 + pmaddwd m0, m0 + paddd m2, m6 ; h sumsq3 + shufpd m6, m4, m5, 0x05 + punpcklwd m5, m6, m4 + paddw m8, m4, m6 + pmaddwd m5, m5 + punpckhwd m6, m4 + pmaddwd m6, m6 + paddd m3, m0 + paddw m8, m1 ; h sum5 + paddd m5, m2 ; h sumsq5 + paddd m6, m3 + mova [t3+r10*2+400*8+ 8], m8 ; we need a clean copy of the last row TODO: t4? 
+ mova [t3+r10*2+400*0+ 8], m5 ; in case height is odd + mova [t3+r10*2+400*0+40], m6 + paddw m8, [t1+r10+400* 0] + paddd m5, [t1+r10+400* 2] + paddd m6, [t1+r10+400* 4] + mova [t1+r10+400* 0], m8 + mova [t1+r10+400* 2], m5 + mova [t1+r10+400* 4], m6 + paddw m0, m1, [t1+r10+400* 6] + paddd m4, m2, [t1+r10+400* 8] + paddd m5, m3, [t1+r10+400*10] + mova [t1+r10+400* 6], m1 + mova [t1+r10+400* 8], m2 + mova [t1+r10+400*10], m3 + paddw m1, m0, [t2+r10+400* 6] + paddd m2, m4, [t2+r10+400* 8] + paddd m3, m5, [t2+r10+400*10] + mova [t2+r10+400* 6], m0 + mova [t2+r10+400* 8], m4 + mova [t2+r10+400*10], m5 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m7 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m7 + pmaddwd m2, m2 + punpckhwd m3, m7 + pmaddwd m3, m3 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pmaxud m4, m2 + psubd m4, m2 ; p3 + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m14 ; p3 * s1 + pmulld m5, m14 + pmaddwd m0, m12 ; b3 * 455 + pmaddwd m1, m12 + paddusw m4, m12 + paddusw m5, m12 + psrad m3, m4, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 ; x3 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + psrld m0, 12 + psrld m1, 12 + mova [t4+r10*1+400*2+ 4], m2 + mova [t3+r10*2+400*4+ 8], xm0 + vextracti128 [t3+r10*2+400*4+40], m0, 1 + mova [t3+r10*2+400*4+24], xm1 + vextracti128 [t3+r10*2+400*4+56], m1, 1 + add r10, 32 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + vpbroadcastq xm5, [leftq] + vinserti128 m5, [lpfq+wq], 1 + mova m4, [lpfq+wq] + add leftq, 8 + palignr m4, m5, 10 + jmp .hv1_main +.hv1_extend_left: + mova xm4, [lpfq+wq] + pshufb xm4, [sgr_lshuf5] + vinserti128 m4, [lpfq+wq+10], 1 + jmp .hv1_main +.hv1_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left +.hv1_loop: + movu m4, [lpfq+r10- 2] +.hv1_main: + movu m5, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp r10d, -36 + jl .hv1_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right +.hv1_have_right: + palignr m6, m5, m4, 2 + palignr m3, m5, m4, 4 + paddw m2, m6, m3 + punpcklwd m0, m6, m3 + pmaddwd m0, m0 + punpckhwd m6, m3 + pmaddwd m6, m6 + palignr m3, m5, m4, 6 + paddw m2, m3 ; h sum3 + punpcklwd m1, m3, m7 + pmaddwd m1, m1 + punpckhwd m3, m7 + pmaddwd m3, m3 + paddd m0, m1 ; h sumsq3 + shufpd m1, m4, m5, 0x05 + punpckhwd m5, m4, m1 + paddw m8, m4, m1 + pmaddwd m5, m5 + punpcklwd m4, m1 + pmaddwd m4, m4 + paddd m6, m3 + paddw m1, m2, [t2+r10+400* 6] + mova [t2+r10+400* 6], m2 + paddw m8, m2 ; h sum5 + paddd m2, m0, [t2+r10+400* 8] + paddd m3, m6, [t2+r10+400*10] + mova [t2+r10+400* 8], m0 + mova [t2+r10+400*10], m6 + paddd m4, m0 ; h sumsq5 + paddd m5, m6 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 + pslld m0, m2, 3 + pslld m6, m3, 3 + paddd m2, m0 ; ((a3 + 8) >> 4) * 9 + paddd m3, m6 + psrlw m6, m1, 1 + pavgw m6, m7 ; (b3 + 2) >> 2 + punpcklwd m0, m6, m7 + pmaddwd m0, m0 + punpckhwd m6, m7 + pmaddwd m6, m6 + pmaxud m2, m0 + psubd m2, m0 ; p3 + pmaxud m3, m6 + psubd m3, m6 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pmulld m2, m14 ; p3 * s1 + pmulld m3, m14 + pmaddwd m0, m12 ; b3 * 455 + pmaddwd m1, m12 + paddusw m2, m12 
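+; adding 0xf008 to the high words with unsigned saturation lets the
+; arithmetic shift by 20 below produce min(z3, 255) - 256; r13 points
+; 256 entries into sgr_x_by_x_avx2 to absorb the -256 bias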
+ paddusw m3, m12 + psrad m7, m2, 20 ; min(z3, 255) - 256 + vpgatherdd m6, [r13+m7*4], m2 ; x3 + psrad m2, m3, 20 + vpgatherdd m7, [r13+m2*4], m3 + pmulld m0, m6 + packssdw m6, m7 + pmulld m7, m1 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m7, m10 + psrld m0, 12 + psrld m7, 12 + paddw m1, m8, [t2+r10+400*0] + paddd m2, m4, [t2+r10+400*2] + paddd m3, m5, [t2+r10+400*4] + paddw m1, [t1+r10+400*0] + paddd m2, [t1+r10+400*2] + paddd m3, [t1+r10+400*4] + mova [t2+r10+400*0], m8 + mova [t2+r10+400*2], m4 + mova [t2+r10+400*4], m5 + mova [t4+r10*1+400*4 +4], m6 + mova [t3+r10*2+400*8+ 8], xm0 + vextracti128 [t3+r10*2+400*8+40], m0, 1 + mova [t3+r10*2+400*8+24], xm7 + vextracti128 [t3+r10*2+400*8+56], m7, 1 + vpbroadcastd m4, [pd_25] + pxor m7, m7 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a5 + 8) >> 4 + psrld m3, 4 + pmulld m2, m4 ; ((a5 + 8) >> 4) * 25 + pmulld m3, m4 + psrlw m5, m1, 1 + pavgw m5, m7 ; (b5 + 2) >> 2 + punpcklwd m4, m5, m7 + pmaddwd m4, m4 + punpckhwd m5, m7 + pmaddwd m5, m5 + punpcklwd m0, m1, m7 ; b5 + punpckhwd m1, m7 + pmaxud m2, m4 + psubd m2, m4 ; p5 + vpbroadcastd m4, [pd_0xf00800a4] + pmaxud m3, m5 + psubd m3, m5 + pmulld m2, m13 ; p5 * s0 + pmulld m3, m13 + pmaddwd m0, m4 ; b5 * 164 + pmaddwd m1, m4 + paddusw m2, m4 + paddusw m3, m4 + psrad m5, m2, 20 ; min(z5, 255) - 256 + vpgatherdd m4, [r13+m5*4], m2 ; x5 + psrad m2, m3, 20 + vpgatherdd m5, [r13+m2*4], m3 + pmulld m0, m4 + pmulld m1, m5 + packssdw m4, m5 + paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m1, m10 + psrld m0, 12 + psrld m1, 12 + mova [t4+r10*1+400*0+ 4], m4 + mova [t3+r10*2+400*0+ 8], xm0 + vextracti128 [t3+r10*2+400*0+40], m0, 1 + mova [t3+r10*2+400*0+24], xm1 + vextracti128 [t3+r10*2+400*0+56], m1, 1 + add r10, 32 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab3 (even rows) + lea r10, [wq-4] +.v0_loop: + mova m0, [t1+r10+400* 6] + mova m4, [t1+r10+400* 8] + mova m5, [t1+r10+400*10] + paddw m0, m0 + paddd m4, m4 + paddd m5, m5 + paddw m1, m0, [t2+r10+400* 6] + paddd m2, m4, [t2+r10+400* 8] + paddd m3, m5, [t2+r10+400*10] + mova [t2+r10+400* 6], m0 + mova [t2+r10+400* 8], m4 + mova [t2+r10+400*10], m5 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m7 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m7 + pmaddwd m2, m2 + punpckhwd m3, m7 + pmaddwd m3, m3 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pmaxud m4, m2 + psubd m4, m2 ; p3 + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m14 ; p3 * s1 + pmulld m5, m14 + pmaddwd m0, m12 ; b3 * 455 + pmaddwd m1, m12 + paddusw m4, m12 + paddusw m5, m12 + psrad m3, m4, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 ; x3 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + psrld m0, 12 + psrld m1, 12 + mova m3, [t1+r10+400*0] + mova m4, [t1+r10+400*2] + mova m5, [t1+r10+400*4] + mova [t3+r10*2+400*8+ 8], m3 + mova [t3+r10*2+400*0+ 8], m4 + mova [t3+r10*2+400*0+40], m5 + paddw m3, m3 ; cc5 + paddd m4, m4 + paddd m5, m5 + mova [t1+r10+400*0], m3 + mova [t1+r10+400*2], m4 + mova [t1+r10+400*4], m5 + mova [t4+r10*1+400*2+ 4], m2 + mova [t3+r10*2+400*4+ 8], xm0 + vextracti128 [t3+r10*2+400*4+40], m0, 1 + mova [t3+r10*2+400*4+24], xm1 + vextracti128 [t3+r10*2+400*4+56], m1, 1 + add r10, 32 + jl .v0_loop + ret +.v1: ; vertical boxsums + 
ab (odd rows) + lea r10, [wq-4] +.v1_loop: + mova m4, [t1+r10+400* 6] + mova m5, [t1+r10+400* 8] + mova m6, [t1+r10+400*10] + paddw m1, m4, [t2+r10+400* 6] + paddd m2, m5, [t2+r10+400* 8] + paddd m3, m6, [t2+r10+400*10] + mova [t2+r10+400* 6], m4 + mova [t2+r10+400* 8], m5 + mova [t2+r10+400*10], m6 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m7 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m7 + pmaddwd m2, m2 + punpckhwd m3, m7 + pmaddwd m3, m3 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pmaxud m4, m2 + psubd m4, m2 ; p3 + pmaxud m5, m3 + psubd m5, m3 + pmulld m4, m14 ; p3 * s1 + pmulld m5, m14 + pmaddwd m0, m12 ; b3 * 455 + pmaddwd m1, m12 + paddusw m4, m12 + paddusw m5, m12 + psrad m3, m4, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r13+m3*4], m4 ; x3 + psrad m4, m5, 20 + vpgatherdd m3, [r13+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + packssdw m2, m3 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + psrld m0, 12 + psrld m8, m1, 12 + mova [t4+r10*1+400*4+4], m2 + mova m4, [t3+r10*2+400*8+ 8] + mova m5, [t3+r10*2+400*0+ 8] + mova m6, [t3+r10*2+400*0+40] + paddw m1, m4, [t2+r10+400*0] + paddd m2, m5, [t2+r10+400*2] + paddd m3, m6, [t2+r10+400*4] + paddw m1, [t1+r10+400*0] + paddd m2, [t1+r10+400*2] + paddd m3, [t1+r10+400*4] + mova [t2+r10+400*0], m4 + mova [t2+r10+400*2], m5 + mova [t2+r10+400*4], m6 + vpbroadcastd m4, [pd_25] + mova [t3+r10*2+400*8+ 8], xm0 + vextracti128 [t3+r10*2+400*8+40], m0, 1 + mova [t3+r10*2+400*8+24], xm8 + vextracti128 [t3+r10*2+400*8+56], m8, 1 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a5 + 8) >> 4 + psrld m3, 4 + pmulld m2, m4 ; ((a5 + 8) >> 4) * 25 + pmulld m3, m4 + psrlw m5, m1, 1 + pavgw m5, m7 ; (b5 + 2) >> 2 + punpcklwd m4, m5, m7 + pmaddwd m4, m4 + punpckhwd m5, m7 + pmaddwd m5, m5 + punpcklwd m0, m1, m7 ; b5 + punpckhwd m1, m7 + pmaxud m2, m4 + psubd m2, m4 ; p5 + vpbroadcastd m4, [pd_0xf00800a4] + pmaxud m3, m5 + psubd m3, m5 + pmulld m2, m13 ; p5 * s0 + pmulld m3, m13 + pmaddwd m0, m4 ; b5 * 164 + pmaddwd m1, m4 + paddusw m2, m4 + paddusw m3, m4 + psrad m5, m2, 20 ; min(z5, 255) - 256 + vpgatherdd m4, [r13+m5*4], m2 ; x5 + psrad m2, m3, 20 + vpgatherdd m5, [r13+m2*4], m3 + pmulld m0, m4 + pmulld m1, m5 + packssdw m4, m5 + paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m1, m10 + psrld m0, 12 + psrld m1, 12 + mova [t4+r10*1+400*0+ 4], m4 + mova [t3+r10*2+400*0+ 8], xm0 + vextracti128 [t3+r10*2+400*0+40], m0, 1 + mova [t3+r10*2+400*0+24], xm1 + vextracti128 [t3+r10*2+400*0+56], m1, 1 + add r10, 32 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + movu xm0, [t4+r10*1+400*0+2] + paddw xm2, xm0, [t4+r10*1+400*0+0] + paddw xm2, [t4+r10*1+400*0+4] + movu m1, [t3+r10*2+400*0+4] + paddd m3, m1, [t3+r10*2+400*0+0] + paddd m3, [t3+r10*2+400*0+8] + paddw xm0, xm2 + paddd m1, m3 + psllw xm2, 2 + pslld m3, 2 + paddw xm0, xm2 ; a5 565 + paddd m1, m3 ; b5 565 + mova [t4+r10*1+400* 6], xm0 + mova [t3+r10*2+400*12], m1 + mova xm0, [t4+r10*1+400*2+0] + paddw xm0, [t4+r10*1+400*2+4] + paddw xm2, xm0, [t4+r10*1+400*2+2] + mova m1, [t3+r10*2+400*4+0] + paddd m1, [t3+r10*2+400*4+8] + paddd m3, m1, [t3+r10*2+400*4+4] + psllw xm2, 2 ; a3[-1] 444 + pslld m3, 2 ; b3[-1] 444 + psubw xm2, xm0 ; a3[-1] 343 + psubd m3, m1 ; b3[-1] 343 + mova [t4+r10*1+400* 8], xm2 + mova [t3+r10*2+400*16], m3 + mova xm0, [t4+r10*1+400*4+0] + 
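+; same three-tap pattern for the a3[ 0]/b3[ 0] row: (l+m+r)*4 forms the
+; '444' sum and subtracting l+r from it leaves the '343' (3,4,3) sum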
paddw xm0, [t4+r10*1+400*4+4] + paddw xm2, xm0, [t4+r10*1+400*4+2] + mova m1, [t3+r10*2+400*8+0] + paddd m1, [t3+r10*2+400*8+8] + paddd m3, m1, [t3+r10*2+400*8+4] + psllw xm2, 2 ; a3[ 0] 444 + pslld m3, 2 ; b3[ 0] 444 + mova [t4+r10*1+400*10], xm2 + mova [t3+r10*2+400*20], m3 + psubw xm2, xm0 ; a3[ 0] 343 + psubd m3, m1 ; b3[ 0] 343 + mova [t4+r10*1+400*12], xm2 + mova [t3+r10*2+400*24], m3 + add r10, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + movu xm2, [t4+r10*1+2] + paddw xm0, xm2, [t4+r10*1+0] + paddw xm0, [t4+r10*1+4] + paddw xm2, xm0 + psllw xm0, 2 + paddw xm0, xm2 ; a5 + movu m1, [t3+r10*2+4] + paddd m4, m1, [t3+r10*2+0] + paddd m4, [t3+r10*2+8] + paddd m1, m4 + pslld m4, 2 + paddd m4, m1 ; b5 + paddw xm2, xm0, [t4+r10*1+400* 6] + mova [t4+r10*1+400* 6], xm0 + paddd m0, m4, [t3+r10*2+400*12] + mova [t3+r10*2+400*12], m4 + mova xm3, [t4+r10*1+400*2+0] + paddw xm3, [t4+r10*1+400*2+4] + paddw xm5, xm3, [t4+r10*1+400*2+2] + psllw xm5, 2 ; a3[ 1] 444 + psubw xm4, xm5, xm3 ; a3[ 1] 343 + paddw xm3, xm4, [t4+r10*1+400* 8] + paddw xm3, [t4+r10*1+400*10] + mova [t4+r10*1+400* 8], xm4 + mova [t4+r10*1+400*10], xm5 + mova m1, [t3+r10*2+400*4+0] + paddd m1, [t3+r10*2+400*4+8] + paddd m5, m1, [t3+r10*2+400*4+4] + pslld m5, 2 ; b3[ 1] 444 + psubd m4, m5, m1 ; b3[ 1] 343 + paddd m1, m4, [t3+r10*2+400*16] + paddd m1, [t3+r10*2+400*20] + mova [t3+r10*2+400*16], m4 + mova [t3+r10*2+400*20], m5 + pmovzxwd m4, [dstq+r10] + pmovzxwd m2, xm2 ; a5 + pmovzxwd m3, xm3 ; a3 + pmaddwd m2, m4 ; a5 * src + pmaddwd m3, m4 ; a3 * src + pslld m4, 13 + psubd m0, m2 ; b5 - a5 * src + (1 << 8) + psubd m1, m3 ; b3 - a3 * src + (1 << 8) + psrld m0, 9 + pslld m1, 7 + pblendw m0, m1, 0xaa + pmaddwd m0, m15 + paddd m4, m11 + paddd m0, m4 + psrad m0, 7 + vextracti128 xm1, m0, 1 + packusdw xm0, xm1 ; clip + psrlw xm0, 6 + mova [dstq+r10], xm0 + add r10, 16 + jl .n0_loop + add dstq, strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + mova xm3, [t4+r10*1+400*4+0] + paddw xm3, [t4+r10*1+400*4+4] + paddw xm5, xm3, [t4+r10*1+400*4+2] + psllw xm5, 2 ; a3[ 1] 444 + psubw xm4, xm5, xm3 ; a3[ 1] 343 + paddw xm3, xm4, [t4+r10*1+400*12] + paddw xm3, [t4+r10*1+400*10] + mova [t4+r10*1+400*10], xm5 + mova [t4+r10*1+400*12], xm4 + mova m1, [t3+r10*2+400*8+0] + paddd m1, [t3+r10*2+400*8+8] + paddd m5, m1, [t3+r10*2+400*8+4] + pslld m5, 2 ; b3[ 1] 444 + psubd m4, m5, m1 ; b3[ 1] 343 + paddd m1, m4, [t3+r10*2+400*24] + paddd m1, [t3+r10*2+400*20] + mova [t3+r10*2+400*20], m5 + mova [t3+r10*2+400*24], m4 + pmovzxwd m4, [dstq+r10] + pmovzxwd m2, [t4+r10*1+400* 6] + pmovzxwd m3, xm3 + mova m0, [t3+r10*2+400*12] + pmaddwd m2, m4 ; a5 * src + pmaddwd m3, m4 ; a3 * src + pslld m4, 13 + psubd m0, m2 ; b5 - a5 * src + (1 << 8) + psubd m1, m3 ; b3 - a3 * src + (1 << 8) + psrld m0, 8 + pslld m1, 7 + pblendw m0, m1, 0xaa + pmaddwd m0, m15 + paddd m4, m11 + paddd m0, m4 + psrad m0, 7 + vextracti128 xm1, m0, 1 + packusdw xm0, xm1 ; clip + psrlw xm0, 6 + mova [dstq+r10], xm0 + add r10, 16 + jl .n1_loop + add dstq, strideq + ret + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/looprestoration16_avx512.asm b/third_party/dav1d/src/x86/looprestoration16_avx512.asm new file mode 100644 index 0000000000..e560c54a40 --- /dev/null +++ b/third_party/dav1d/src/x86/looprestoration16_avx512.asm @@ -0,0 +1,2524 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 16 + +wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11 +wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11 +wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15 +wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 +wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +r_ext_mask: times 72 db -1 + times 8 db 0 +wiener_hshift: dw 4, 4, 1, 1 +wiener_vshift: dw 1024, 1024, 4096, 4096 +wiener_round: dd 1049600, 1048832 + +pw_164_455: dw 164, 455 +pw_1023: times 2 dw 1023 +pw_61448: times 2 dw 61448 +pd_m262128: dd -262128 +pd_m34816: dd -34816 +pd_m25: dd -25 +pd_m9: dd -9 +pd_8: dd 8 +pd_2147483648: dd 2147483648 + +cextern sgr_x_by_x + +SECTION .text + +DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers + +INIT_ZMM avx512icl +cglobal wiener_filter7_16bpc, 4, 15, 17, -384*12-16, dst, stride, left, lpf, \ + w, h, edge, flt +%define base t4-wiener_hshift + mov fltq, r6mp + movifnidn wd, wm + movifnidn hd, hm + mov edged, r7m + mov t3d, r8m ; pixel_max + vbroadcasti128 m6, [wiener_shufA] + vpbroadcastd m12, [fltq+ 0] ; x0 x1 + lea t4, [wiener_hshift] + vbroadcasti128 m7, [wiener_shufB] + add wd, wd + vpbroadcastd m13, [fltq+ 4] ; x2 x3 + shr t3d, 11 + vpbroadcastd m14, [fltq+16] ; y0 y1 + add lpfq, wq + vpbroadcastd m15, [fltq+20] ; y2 y3 + add dstq, wq + vbroadcasti128 m8, [wiener_shufC] + lea t1, [rsp+wq+16] + vbroadcasti128 m9, [wiener_shufD] + neg wq + vpbroadcastd m0, [base+wiener_hshift+t3*4] + mov r10d, 0xfe + vpbroadcastd m10, [base+wiener_round+t3*4] + kmovb k1, r10d + vpbroadcastd m11, [base+wiener_vshift+t3*4] + pmullw m12, m0 ; upshift filter coefs to make the + vpbroadcastd m16, [pd_m262128] + pmullw m13, m0 ; horizontal downshift constant + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t6, t1 + mov t5, t1 + add t1, 384*2 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + mov t4, t1 + add t1, 384*2 + add r10, strideq + mov [rsp], r10 ; below + call .h + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + 
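+; one more horizontally filtered row and t1-t6 hold the six buffered rows
+; that the 7-tap vertical filter combines with each newly filtered row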
add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v3 +.main: + lea t0, [t1+384*2] +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v3 + mov lpfq, [rsp] + call .hv_bottom + add lpfq, strideq + call .hv_bottom +.v1: + call .v + RET +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + mov t6, t1 + mov t5, t1 + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v3 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v3 + add t0, 384*8 + call .hv + dec hd + jnz .main +.v3: + call .v +.v2: + call .v + jmp .v1 +.h: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movq xm3, [leftq] + vmovdqu64 m3{k1}, [lpfq+r10-8] + add leftq, 8 + jmp .h_main +.h_extend_left: + mova m4, [lpfq+r10+0] + vpbroadcastw xm3, xm4 + vmovdqu64 m3{k1}, [lpfq+r10-8] + jmp .h_main2 +.h_top: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m3, [lpfq+r10-8] +.h_main: + mova m4, [lpfq+r10+0] +.h_main2: + movu m5, [lpfq+r10+8] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -68 + jl .h_have_right + push r0 + lea r0, [r_ext_mask+66] + vpbroadcastw m0, [lpfq-2] + vpternlogd m3, m0, [r0+r10+ 0], 0xe4 ; c ? a : b + vpternlogd m4, m0, [r0+r10+ 8], 0xe4 + vpternlogd m5, m0, [r0+r10+16], 0xe4 + pop r0 +.h_have_right: + pshufb m2, m3, m6 + pshufb m1, m4, m7 + paddw m2, m1 + pshufb m3, m8 + mova m0, m16 + vpdpwssd m0, m2, m12 + pshufb m1, m4, m9 + paddw m3, m1 + pshufb m1, m4, m6 + vpdpwssd m0, m3, m13 + pshufb m2, m5, m7 + paddw m2, m1 + mova m1, m16 + pshufb m4, m8 + vpdpwssd m1, m2, m12 + pshufb m5, m9 + paddw m4, m5 + vpdpwssd m1, m4, m13 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + psraw m0, 1 + mova [t1+r10], m0 + add r10, 64 + jl .h_loop + ret +ALIGN function_align +.hv: + add lpfq, strideq + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movq xm3, [leftq] + vmovdqu64 m3{k1}, [lpfq+r10-8] + add leftq, 8 + jmp .hv_main +.hv_extend_left: + mova m4, [lpfq+r10+0] + vpbroadcastw xm3, xm4 + vmovdqu64 m3{k1}, [lpfq+r10-8] + jmp .hv_main2 +.hv_bottom: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m3, [lpfq+r10-8] +.hv_main: + mova m4, [lpfq+r10+0] +.hv_main2: + movu m5, [lpfq+r10+8] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -68 + jl .hv_have_right + push r0 + lea r0, [r_ext_mask+66] + vpbroadcastw m0, [lpfq-2] + vpternlogd m3, m0, [r0+r10+ 0], 0xe4 + vpternlogd m4, m0, [r0+r10+ 8], 0xe4 + vpternlogd m5, m0, [r0+r10+16], 0xe4 + pop r0 +.hv_have_right: + pshufb m2, m3, m6 + pshufb m1, m4, m7 + paddw m2, m1 + pshufb m3, m8 + mova m0, m16 + vpdpwssd m0, m2, m12 + pshufb m1, m4, m9 + paddw m3, m1 + pshufb m1, m4, m6 + vpdpwssd m0, m3, m13 + pshufb m2, m5, m7 + paddw m2, m1 + pshufb m4, m8 + mova m1, m16 + vpdpwssd m1, m2, m12 + pshufb m5, m9 + paddw m4, m5 + vpdpwssd m1, m4, m13 + mova m2, [t4+r10] + paddw m2, [t2+r10] + mova m5, [t3+r10] + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + mova m4, [t5+r10] + paddw m4, [t1+r10] + psraw m0, 1 + paddw m3, m0, [t6+r10] + mova [t0+r10], m0 + punpcklwd m1, m2, m5 + mova m0, m10 + vpdpwssd m0, m1, m15 + punpckhwd m2, m5 + mova m1, m10 + vpdpwssd m1, m2, m15 + punpcklwd m2, m3, m4 + vpdpwssd m0, m2, m14 + punpckhwd m3, m4 + vpdpwssd m1, m3, m14 + psrad m0, 5 + psrad m1, 5 + packusdw m0, m1 + pmulhuw m0, m11 + mova 
[dstq+r10], m0 + add r10, 64 + jl .hv_loop + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t6 + add dstq, strideq + ret +.v: + mov r10, wq +.v_loop: + mova m2, [t4+r10] + paddw m2, [t2+r10] + mova m3, [t3+r10] + punpcklwd m1, m2, m3 + mova m0, m10 + vpdpwssd m0, m1, m15 + punpckhwd m2, m3 + mova m1, m10 + vpdpwssd m1, m2, m15 + mova m4, [t1+r10] + paddw m3, m4, [t6+r10] + paddw m4, [t5+r10] + punpcklwd m2, m3, m4 + vpdpwssd m0, m2, m14 + punpckhwd m3, m4 + vpdpwssd m1, m3, m14 + psrad m0, 5 + psrad m1, 5 + packusdw m0, m1 + pmulhuw m0, m11 + mova [dstq+r10], m0 + add r10, 64 + jl .v_loop + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + add dstq, strideq + ret + +cglobal wiener_filter5_16bpc, 4, 14, 15, 384*8+16, dst, stride, left, lpf, \ + w, h, edge, flt +%define base r13-r_ext_mask-70 + mov fltq, r6mp + movifnidn wd, wm + movifnidn hd, hm + mov edged, r7m + mov t3d, r8m ; pixel_max + vbroadcasti128 m5, [wiener_shufE] + vpbroadcastw m11, [fltq+ 2] ; x1 + vbroadcasti128 m6, [wiener_shufB] + lea r13, [r_ext_mask+70] + vbroadcasti128 m7, [wiener_shufD] + add wd, wd + vpbroadcastd m12, [fltq+ 4] ; x2 x3 + shr t3d, 11 + vpbroadcastd m8, [pd_m262128] ; (1 << 4) - (1 << 18) + add lpfq, wq + vpbroadcastw m13, [fltq+18] ; y1 + add dstq, wq + vpbroadcastd m14, [fltq+20] ; y2 y3 + lea t1, [rsp+wq+16] + vpbroadcastd m0, [base+wiener_hshift+t3*4] + neg wq + vpbroadcastd m9, [base+wiener_round+t3*4] + mov r10d, 0xfffe + vpbroadcastd m10, [base+wiener_vshift+t3*4] + kmovw k1, r10d + pmullw m11, m0 + pmullw m12, m0 + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t4, t1 + add t1, 384*2 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + mov t3, t1 + add t1, 384*2 + add r10, strideq + mov [rsp], r10 ; below + call .h + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v2 +.main: + mov t0, t4 +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v2 + mov lpfq, [rsp] + call .hv_bottom + add lpfq, strideq + call .hv_bottom +.end: + RET +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v2 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v2 + add t0, 384*6 + call .hv + dec hd + jnz .main +.v2: + call .v + mov t4, t3 + mov t3, t2 + mov t2, t1 + add dstq, strideq +.v1: + call .v + jmp .end +.h: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movd xm3, [leftq+4] + vmovdqu32 m3{k1}, [lpfq+r10-4] + add leftq, 8 + jmp .h_main +.h_extend_left: + vpbroadcastw xm3, [lpfq+r10] + vmovdqu32 m3{k1}, [lpfq+r10-4] + jmp .h_main +.h_top: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m3, [lpfq+r10-4] +.h_main: + movu m4, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -66 + jl .h_have_right + vpbroadcastw m0, [lpfq-2] + vpternlogd m3, m0, [r13+r10+0], 0xe4 ; c ? 
a : b + vpternlogd m4, m0, [r13+r10+8], 0xe4 +.h_have_right: + pshufb m1, m3, m5 + mova m0, m8 + vpdpwssd m0, m1, m11 + pshufb m2, m4, m5 + mova m1, m8 + vpdpwssd m1, m2, m11 + pshufb m2, m3, m6 + pshufb m3, m7 + paddw m2, m3 + pshufb m3, m4, m6 + vpdpwssd m0, m2, m12 + pshufb m4, m7 + paddw m3, m4 + vpdpwssd m1, m3, m12 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + psraw m0, 1 + mova [t1+r10], m0 + add r10, 64 + jl .h_loop + ret +ALIGN function_align +.hv: + add lpfq, strideq + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movd xm3, [leftq+4] + vmovdqu32 m3{k1}, [lpfq+r10-4] + add leftq, 8 + jmp .hv_main +.hv_extend_left: + vpbroadcastw xm3, [lpfq+r10] + vmovdqu32 m3{k1}, [lpfq+r10-4] + jmp .hv_main +.hv_bottom: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m3, [lpfq+r10-4] +.hv_main: + movu m4, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -66 + jl .hv_have_right + vpbroadcastw m0, [lpfq-2] + vpternlogd m3, m0, [r13+r10+0], 0xe4 + vpternlogd m4, m0, [r13+r10+8], 0xe4 +.hv_have_right: + pshufb m1, m3, m5 + mova m0, m8 + vpdpwssd m0, m1, m11 + pshufb m2, m4, m5 + mova m1, m8 + vpdpwssd m1, m2, m11 + pshufb m2, m3, m6 + pshufb m3, m7 + paddw m2, m3 + pshufb m3, m4, m6 + vpdpwssd m0, m2, m12 + pshufb m4, m7 + paddw m4, m3 + vpdpwssd m1, m4, m12 + mova m2, [t3+r10] + paddw m2, [t1+r10] + mova m3, [t2+r10] + punpcklwd m4, m2, m3 + punpckhwd m2, m3 + mova m3, m9 + vpdpwssd m3, m2, m14 + mova m2, m9 + vpdpwssd m2, m4, m14 + mova m4, [t4+r10] + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + psraw m0, 1 + mova [t0+r10], m0 + punpcklwd m1, m0, m4 + vpdpwssd m2, m1, m13 + punpckhwd m0, m4 + vpdpwssd m3, m0, m13 + psrad m2, 5 + psrad m3, 5 + packusdw m2, m3 + pmulhuw m2, m10 + mova [dstq+r10], m2 + add r10, 64 + jl .hv_loop + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t4 + add dstq, strideq + ret +.v: + mov r10, wq +.v_loop: + mova m0, [t1+r10] + paddw m2, m0, [t3+r10] + mova m1, [t2+r10] + mova m4, [t4+r10] + punpckhwd m3, m2, m1 + pmaddwd m3, m14 + punpcklwd m2, m1 + pmaddwd m2, m14 + punpckhwd m1, m0, m4 + pmaddwd m1, m13 + punpcklwd m0, m4 + pmaddwd m0, m13 + paddd m3, m9 + paddd m2, m9 + paddd m1, m3 + paddd m0, m2 + psrad m1, 5 + psrad m0, 5 + packusdw m0, m1 + pmulhuw m0, m10 + mova [dstq+r10], m0 + add r10, 64 + jl .v_loop + ret + +cglobal sgr_filter_5x5_16bpc, 4, 14, 22, 416*24+8, dst, stride, left, lpf, \ + w, h, edge, params +%define base r13-r_ext_mask-72 + movifnidn wd, wm + mov paramsq, r6mp + lea r13, [r_ext_mask+72] + mov edged, r7m + movifnidn hd, hm + pxor m6, m6 + vpbroadcastw m7, [paramsq+8] ; w0 + add wd, wd + vpbroadcastd m8, [base+pd_8] + add lpfq, wq + vpbroadcastd m9, [base+pd_m25] + add dstq, wq + vpsubd m10, m6, [paramsq+0] {1to16} ; -s0 + lea t3, [rsp+wq*2+416*12+8] + vpbroadcastd m11, [base+pw_164_455] + lea t4, [rsp+wq+416*20+8] + vpbroadcastd m12, [base+pw_61448] ; (15 << 12) + (1 << 3) + lea t1, [rsp+wq+12] + vpbroadcastd m13, [base+pd_m34816] ; -((1 << 11) + (1 << 15)) + neg wq + vpbroadcastd m14, [base+pw_1023] + psllw m7, 4 + mova m18, [sgr_x_by_x+64*0] + mov r10d, 0xfffffff8 + mova m19, [sgr_x_by_x+64*1] + kmovd k1, r10d + mova m20, [sgr_x_by_x+64*2] + mov r10, 0x3333333333333333 + mova m21, [sgr_x_by_x+64*3] + kmovq k2, r10 + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t2, t1 + call .top_fixup + add t1, 416*6 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, strideq + mov [rsp], r10 ; below + mov t0, 
t2 + dec hd + jz .height1 + or edged, 16 + call .h +.main: + add lpfq, strideq + call .hv + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, strideq + test hd, hd + jz .odd_height + call .h + add lpfq, strideq + call .hv + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .h_top + add lpfq, strideq + call .hv_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .hv + call .prep_n + jmp .odd_height_end +.odd_height: + call .hv + call .n0 + call .n1 +.odd_height_end: + call .v + call .n0 + jmp .end2 +.extend_bottom: + call .v + jmp .end +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + lea t2, [t1+416*6] + call .top_fixup + dec hd + jz .no_top_height1 + or edged, 16 + mov t0, t1 + mov t1, t2 + jmp .main +.no_top_height1: + call .v + call .prep_n + jmp .odd_height_end +.h: ; horizontal boxsum + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movq xm16, [leftq+2] + vmovdqu16 m16{k1}, [lpfq+wq-6] + add leftq, 8 + jmp .h_main +.h_extend_left: + vpbroadcastw xm16, [lpfq+wq] + vmovdqu16 m16{k1}, [lpfq+wq-6] + jmp .h_main +.h_top: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m16, [lpfq+r10- 2] +.h_main: + movu m17, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -68 + jl .h_have_right + vpbroadcastw m0, [lpfq-2] + vpternlogd m16, m0, [r13+r10+ 0], 0xe4 ; c ? a : b + vpternlogd m17, m0, [r13+r10+16], 0xe4 +.h_have_right: + palignr m2, m17, m16, 2 + paddw m0, m16, m2 + palignr m3, m17, m16, 6 + paddw m0, m3 + punpcklwd m1, m2, m3 + pmaddwd m1, m1 + punpckhwd m2, m3 + pmaddwd m2, m2 + shufpd m17, m16, m17, 0x55 + paddw m0, m17 + punpcklwd m3, m16, m17 + vpdpwssd m1, m3, m3 + punpckhwd m3, m16, m17 + vpdpwssd m2, m3, m3 + shufps m16, m17, q2121 + paddw m0, m16 ; sum + test edgeb, 16 ; y > 0 + jz .h_loop_end + paddw m0, [t1+r10+416*0] + paddd m1, [t1+r10+416*2] + paddd m2, [t1+r10+416*4] +.h_loop_end: + punpcklwd m17, m16, m6 + vpdpwssd m1, m17, m17 ; sumsq + punpckhwd m16, m6 + vpdpwssd m2, m16, m16 + mova [t1+r10+416*0], m0 + mova [t1+r10+416*2], m1 + mova [t1+r10+416*4], m2 + add r10, 64 + jl .h_loop + ret +.top_fixup: + lea r10, [wq-4] +.top_fixup_loop: ; the sums of the first row needs to be doubled + mova m0, [t1+r10+416*0] + mova m1, [t1+r10+416*2] + mova m2, [t1+r10+416*4] + paddw m0, m0 + paddd m1, m1 + paddd m2, m2 + mova [t2+r10+416*0], m0 + mova [t2+r10+416*2], m1 + mova [t2+r10+416*4], m2 + add r10, 64 + jl .top_fixup_loop + ret +ALIGN function_align +.hv: ; horizontal boxsum + vertical boxsum + ab + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movq xm16, [leftq+2] + vmovdqu16 m16{k1}, [lpfq+wq-6] + add leftq, 8 + jmp .hv_main +.hv_extend_left: + vpbroadcastw xm16, [lpfq+wq] + vmovdqu16 m16{k1}, [lpfq+wq-6] + jmp .hv_main +.hv_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m16, [lpfq+r10- 2] +.hv_main: + movu m17, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -68 + jl .hv_have_right + vpbroadcastw m0, [lpfq-2] + vpternlogd m16, m0, [r13+r10+ 0], 0xe4 + vpternlogd m17, m0, [r13+r10+16], 0xe4 +.hv_have_right: + palignr m3, m17, m16, 2 + paddw m0, m16, m3 + palignr m1, m17, m16, 6 + paddw m0, m1 + punpcklwd m2, m3, m1 + pmaddwd m2, m2 + punpckhwd m3, m1 + pmaddwd m3, m3 + shufpd m17, m16, m17, 0x55 + paddw m0, m17 + punpcklwd 
m1, m16, m17 + vpdpwssd m2, m1, m1 + punpckhwd m1, m16, m17 + vpdpwssd m3, m1, m1 + shufps m16, m17, q2121 + paddw m0, m16 ; h sum + punpcklwd m17, m16, m6 + vpdpwssd m2, m17, m17 ; h sumsq + punpckhwd m16, m6 + vpdpwssd m3, m16, m16 + paddw m1, m0, [t1+r10+416*0] + paddd m16, m2, [t1+r10+416*2] + paddd m17, m3, [t1+r10+416*4] + test hd, hd + jz .hv_last_row +.hv_main2: + paddw m1, [t2+r10+416*0] ; hv sum + paddd m16, [t2+r10+416*2] ; hv sumsq + paddd m17, [t2+r10+416*4] + mova [t0+r10+416*0], m0 + mova [t0+r10+416*2], m2 + mova [t0+r10+416*4], m3 + psrlw m3, m1, 1 + paddd m16, m8 + pavgw m3, m6 ; (b + 2) >> 2 + paddd m17, m8 + psrld m16, 4 ; (a + 8) >> 4 + psrld m17, 4 + pmulld m16, m9 ; -a * 25 + pmulld m17, m9 + punpcklwd m2, m3, m6 + vpdpwssd m16, m2, m2 ; -p + punpckhwd m3, m6 + vpdpwssd m17, m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmulld m16, m10 ; p * s + pmulld m17, m10 + pmaddwd m0, m11 ; b * 164 + pmaddwd m1, m11 + vpalignr m17{k2}, m16, m16, 2 + mova m16, m20 + pmaxsw m17, m6 + paddusw m17, m12 + psraw m17, 4 ; min(z, 255) - 256 + vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x + pandn m16, m13, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + packssdw m16, m17 + psubd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) + psubd m1, m13 + mova [t4+r10+4], m16 + psrld m16, m0, 12 ; b + psrld m17, m1, 12 + mova [t3+r10*2+ 8], xm16 + mova [t3+r10*2+ 24], xm17 + vextracti128 [t3+r10*2+ 40], ym16, 1 + vextracti128 [t3+r10*2+ 56], ym17, 1 + vextracti32x4 [t3+r10*2+ 72], m16, 2 + vextracti32x4 [t3+r10*2+ 88], m17, 2 + vextracti32x4 [t3+r10*2+104], m16, 3 + vextracti32x4 [t3+r10*2+120], m17, 3 + add r10, 64 + jl .hv_loop + mov t2, t1 + mov t1, t0 + mov t0, t2 + ret +.hv_last_row: ; esoteric edge case for odd heights + mova [t1+r10+416*0], m1 + paddw m1, m0 + mova [t1+r10+416*2], m16 + paddd m16, m2 + mova [t1+r10+416*4], m17 + paddd m17, m3 + jmp .hv_main2 +.v: ; vertical boxsum + ab + lea r10, [wq-4] +.v_loop: + mova m2, [t1+r10+416*2] + mova m3, [t1+r10+416*4] + mova m0, [t1+r10+416*0] + paddd m16, m2, [t2+r10+416*2] + paddd m17, m3, [t2+r10+416*4] + paddw m1, m0, [t2+r10+416*0] + paddd m2, m2 + paddd m3, m3 + paddd m16, m2 ; hv sumsq + paddd m17, m3 + paddd m16, m8 + paddd m17, m8 + psrld m16, 4 ; (a + 8) >> 4 + psrld m17, 4 + pmulld m16, m9 ; -a * 25 + pmulld m17, m9 + paddw m0, m0 + paddw m1, m0 ; hv sum + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + vpdpwssd m16, m2, m2 ; -p + punpckhwd m3, m6 + vpdpwssd m17, m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmulld m16, m10 ; p * s + pmulld m17, m10 + pmaddwd m0, m11 ; b * 164 + pmaddwd m1, m11 + vpalignr m17{k2}, m16, m16, 2 + mova m16, m20 + pmaxsw m17, m6 + paddusw m17, m12 + psraw m17, 4 ; min(z, 255) - 256 + vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x + pandn m16, m13, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + packssdw m16, m17 + psubd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) + psubd m1, m13 + mova [t4+r10+4], m16 + psrld m16, m0, 12 ; b + psrld m17, m1, 12 + mova [t3+r10*2+ 8], xm16 + mova [t3+r10*2+ 24], xm17 + vextracti128 [t3+r10*2+ 40], ym16, 1 + vextracti128 [t3+r10*2+ 56], ym17, 1 + vextracti32x4 [t3+r10*2+ 72], m16, 2 + vextracti32x4 [t3+r10*2+ 88], m17, 2 + vextracti32x4 [t3+r10*2+104], m16, 3 + vextracti32x4 [t3+r10*2+120], m17, 3 + add r10, 64 + jl .v_loop + ret 
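The .hv/.v blocks above reduce each 5x5 (n = 25) box to the (x, b) pair that the .prep_n/.n0/.n1 passes below consume. As a reading aid, here is a scalar C sketch of that step, reconstructed from the inline comments rather than taken from the shipped C reference: the constants assume 10-bit input (note the pw_1023 clamp applied on output), the names sgr_ab_5x5, x_by_x, b_out and x_out are illustrative, s is the s0 strength (the asm keeps it negated as -s0 and cancels the sign against pd_m25), and the z rounding is written in its conventional form instead of the asm's word-level equivalent.

#include <stdint.h>

/* Hedged scalar sketch of the per-coefficient "ab" step vectorized by
 * .hv/.v above (5x5 box, n = 25), assuming 10-bit input. sumsq and sum
 * are the combined vertical+horizontal box sums; x_by_x stands in for
 * the 256-byte sgr_x_by_x table loaded into m18-m21. */
static void sgr_ab_5x5(uint32_t sumsq, uint32_t sum, uint32_t s,
                       const uint8_t x_by_x[256],
                       uint32_t *b_out, uint32_t *x_out)
{
    uint32_t a = (sumsq + 8) >> 4;            /* "(a + 8) >> 4" */
    uint32_t b = (sum + 2) >> 2;              /* "(b + 2) >> 2" (pavgw) */
    int64_t  p = (int64_t)a * 25 - (int64_t)b * b;
    if (p < 0) p = 0;                         /* cf. the pmaxsw clamp */
    uint32_t z = (uint32_t)((p * s + (1 << 19)) >> 20);
    uint32_t x = x_by_x[z > 255 ? 255 : z];   /* "min(z, 255)" lookup */
    /* "x * b * 164 + (1 << 11) + (1 << 15)"; ">> 12" gives the stored b */
    *b_out = (x * sum * 164 + (1 << 11) + (1 << 15)) >> 12;
    *x_out = x;                               /* written to the t4 rows */
}

The min() is done without a compare: paddusw against pw_61448 = (15 << 12) + (1 << 3) saturates, and psraw 4 then yields min(z, 255) - 256, a signed byte whose sign bit (extracted with vpmovb2m) picks between the vpermi2b lookup of sgr_x_by_x[0..127] and the vpermt2b lookup of sgr_x_by_x[128..255]. The 3x3 functions below follow the same pattern with n = 9 and a 455 multiplier in place of 25 and 164 (cf. pd_m9 and the "b * 455" comments).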
+.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + movu m0, [t4+r10*1+ 2] + movu m1, [t3+r10*2+ 4] + movu m2, [t3+r10*2+68] + paddw m3, m0, [t4+r10*1+ 0] + paddd m16, m1, [t3+r10*2+ 0] + paddd m17, m2, [t3+r10*2+64] + paddw m3, [t4+r10*1+ 4] + paddd m16, [t3+r10*2+ 8] + paddd m17, [t3+r10*2+72] + paddw m0, m3 + psllw m3, 2 + paddd m1, m16 + pslld m16, 2 + paddd m2, m17 + pslld m17, 2 + paddw m0, m3 ; a 565 + paddd m1, m16 ; b 565 + paddd m2, m17 + mova [t4+r10*1+416*2+ 0], m0 + mova [t3+r10*2+416*4+ 0], m1 + mova [t3+r10*2+416*4+64], m2 + add r10, 64 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + movu m0, [t4+r10*1+ 2] + movu m1, [t3+r10*2+ 4] + movu m2, [t3+r10*2+68] + paddw m3, m0, [t4+r10*1+ 0] + paddd m16, m1, [t3+r10*2+ 0] + paddd m17, m2, [t3+r10*2+64] + paddw m3, [t4+r10*1+ 4] + paddd m16, [t3+r10*2+ 8] + paddd m17, [t3+r10*2+72] + paddw m0, m3 + psllw m3, 2 + paddd m1, m16 + pslld m16, 2 + paddd m2, m17 + pslld m17, 2 + paddw m0, m3 ; a 565 + paddd m1, m16 ; b 565 + paddd m2, m17 + paddw m3, m0, [t4+r10*1+416*2+ 0] + paddd m16, m1, [t3+r10*2+416*4+ 0] + paddd m17, m2, [t3+r10*2+416*4+64] + mova [t4+r10*1+416*2+ 0], m0 + mova [t3+r10*2+416*4+ 0], m1 + mova [t3+r10*2+416*4+64], m2 + mova m0, [dstq+r10] + punpcklwd m1, m0, m6 ; src + punpcklwd m2, m3, m6 ; a + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + vshufi32x4 m1, m16, m17, q2020 + vshufi32x4 m16, m17, q3131 + psubd m1, m2 ; b - a * src + (1 << 8) + psubd m16, m3 + psrad m1, 9 + psrad m16, 9 + packssdw m1, m16 + pmulhrsw m1, m7 + paddw m0, m1 + pmaxsw m0, m6 + pminsw m0, m14 + mova [dstq+r10], m0 + add r10, 64 + jl .n0_loop + add dstq, strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + mova m0, [dstq+r10] + mova m3, [t4+r10*1+416*2+ 0] + mova m16, [t3+r10*2+416*4+ 0] + mova m17, [t3+r10*2+416*4+64] + punpcklwd m1, m0, m6 ; src + punpcklwd m2, m3, m6 ; a + pmaddwd m2, m1 + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + vshufi32x4 m1, m16, m17, q2020 + vshufi32x4 m16, m17, q3131 + psubd m1, m2 ; b - a * src + (1 << 7) + psubd m16, m3 + psrad m1, 8 + psrad m16, 8 + packssdw m1, m16 + pmulhrsw m1, m7 + paddw m0, m1 + pmaxsw m0, m6 + pminsw m0, m14 + mova [dstq+r10], m0 + add r10, 64 + jl .n1_loop + add dstq, strideq + ret + +cglobal sgr_filter_3x3_16bpc, 4, 14, 22, 416*42+8, dst, stride, left, lpf, \ + w, h, edge, params + movifnidn wd, wm + mov paramsq, r6mp + lea r13, [r_ext_mask+72] + mov edged, r7m + movifnidn hd, hm + pxor m6, m6 + vpbroadcastw m7, [paramsq+10] ; w1 + add wd, wd + vpbroadcastd m8, [base+pd_8] + add lpfq, wq + vpbroadcastd m9, [base+pd_m9] + add dstq, wq + vpsubd m10, m6, [paramsq+4] {1to16} ; -s1 + lea t3, [rsp+wq*2+416*12+8] + vpbroadcastd m11, [base+pw_164_455] + lea t4, [rsp+wq+416*32+8] + vpbroadcastd m12, [base+pw_61448] + lea t1, [rsp+wq+12] + vpbroadcastd m13, [base+pd_m34816] + neg wq + vpbroadcastd m14, [base+pw_1023] + psllw m7, 4 + mova m18, [sgr_x_by_x+64*0] + mov r10d, 0xfffffffc + mova m19, [sgr_x_by_x+64*1] + kmovd k1, r10d + mova m20, [sgr_x_by_x+64*2] + mov r10, 0x3333333333333333 + mova m21, [sgr_x_by_x+64*3] + kmovq k2, r10 + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t2, t1 + add t1, 416*6 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, strideq + mov [rsp], r10 ; below + call .hv0 +.main: + dec hd + jz .height1 + add lpfq, strideq + call .hv1 + call 
.prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, strideq + call .hv0 + test hd, hd + jz .odd_height + add lpfq, strideq + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .hv0_bottom + add lpfq, strideq + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + lea r10, [wq-4] + lea t2, [t1+416*6] +.top_fixup_loop: + mova m0, [t1+r10+416*0] + mova m1, [t1+r10+416*2] + mova m2, [t1+r10+416*4] + mova [t2+r10+416*0], m0 + mova [t2+r10+416*2], m1 + mova [t2+r10+416*4], m2 + add r10, 64 + jl .top_fixup_loop + call .v0 + jmp .main +.h: ; horizontal boxsum + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movd xm16, [leftq+4] + vmovdqu16 m16{k1}, [lpfq+wq-4] + add leftq, 8 + jmp .h_main +.h_extend_left: + vpbroadcastw xm16, [lpfq+wq] + vmovdqu16 m16{k1}, [lpfq+wq-4] + jmp .h_main +.h_top: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m16, [lpfq+r10+ 0] +.h_main: + movu m17, [lpfq+r10+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -66 + jl .h_have_right + vpbroadcastw m0, [lpfq-2] + vpternlogd m16, m0, [r13+r10+ 0], 0xe4 + vpternlogd m17, m0, [r13+r10+16], 0xe4 +.h_have_right: + palignr m0, m17, m16, 2 + paddw m1, m16, m0 + punpcklwd m2, m16, m0 + pmaddwd m2, m2 + punpckhwd m3, m16, m0 + pmaddwd m3, m3 + palignr m17, m16, 4 + paddw m1, m17 ; sum + punpcklwd m16, m17, m6 + vpdpwssd m2, m16, m16 ; sumsq + punpckhwd m17, m6 + vpdpwssd m3, m17, m17 + mova [t1+r10+416*0], m1 + mova [t1+r10+416*2], m2 + mova [t1+r10+416*4], m3 + add r10, 64 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows) + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movd xm16, [leftq+4] + vmovdqu16 m16{k1}, [lpfq+wq-4] + add leftq, 8 + jmp .hv0_main +.hv0_extend_left: + vpbroadcastw xm16, [lpfq+wq] + vmovdqu16 m16{k1}, [lpfq+wq-4] + jmp .hv0_main +.hv0_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left +.hv0_loop: + movu m16, [lpfq+r10+ 0] +.hv0_main: + movu m17, [lpfq+r10+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp r10d, -66 + jl .hv0_have_right + vpbroadcastw m0, [lpfq-2] + vpternlogd m16, m0, [r13+r10+ 0], 0xe4 + vpternlogd m17, m0, [r13+r10+16], 0xe4 +.hv0_have_right: + palignr m0, m17, m16, 2 + paddw m1, m16, m0 + punpcklwd m2, m16, m0 + pmaddwd m2, m2 + punpckhwd m3, m16, m0 + pmaddwd m3, m3 + palignr m17, m16, 4 + paddw m1, m17 ; sum + punpcklwd m16, m17, m6 + vpdpwssd m2, m16, m16 ; sumsq + punpckhwd m17, m6 + vpdpwssd m3, m17, m17 + paddw m0, m1, [t1+r10+416*0] + paddd m16, m2, [t1+r10+416*2] + paddd m17, m3, [t1+r10+416*4] + mova [t1+r10+416*0], m1 + mova [t1+r10+416*2], m2 + mova [t1+r10+416*4], m3 + paddw m1, m0, [t2+r10+416*0] + paddd m2, m16, [t2+r10+416*2] + paddd m3, m17, [t2+r10+416*4] + mova [t2+r10+416*0], m0 + mova [t2+r10+416*2], m16 + mova [t2+r10+416*4], m17 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pmulld m2, m9 ; -((a + 8) >> 4) * 9 + pmulld m3, m9 + psrlw m17, m1, 1 + pavgw m17, m6 ; (b + 2) >> 2 + punpcklwd m16, m17, m6 + vpdpwssd m2, m16, 
m16 ; -p + punpckhwd m17, m6 + vpdpwssd m3, m17, m17 + punpcklwd m16, m6, m1 ; b + punpckhwd m17, m6, m1 + pminsd m2, m6 + pminsd m3, m6 + pmulld m2, m10 ; p * s + pmulld m3, m10 + pmaddwd m16, m11 ; b * 455 + pmaddwd m17, m11 + vpalignr m3{k2}, m2, m2, 2 + mova m2, m20 + paddusw m3, m12 + psraw m3, 4 ; min(z, 255) - 256 + vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m3 + vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m3{k3}, m2 ; x + pandn m2, m13, m3 + psrld m3, 16 + pmulld m16, m2 + pmulld m17, m3 + packssdw m2, m3 + psubd m16, m13 ; x * b * 455 + (1 << 11) + (1 << 15) + psubd m17, m13 + mova [t4+r10*1+416*0+4], m2 + psrld m16, 12 + psrld m17, 12 + mova [t3+r10*2+416*0+ 8], xm16 + mova [t3+r10*2+416*0+ 24], xm17 + vextracti128 [t3+r10*2+416*0+ 40], ym16, 1 + vextracti128 [t3+r10*2+416*0+ 56], ym17, 1 + vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2 + vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2 + vextracti32x4 [t3+r10*2+416*0+104], m16, 3 + vextracti32x4 [t3+r10*2+416*0+120], m17, 3 + add r10, 64 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movd xm16, [leftq+4] + vmovdqu16 m16{k1}, [lpfq+wq-4] + add leftq, 8 + jmp .hv1_main +.hv1_extend_left: + vpbroadcastw xm16, [lpfq+wq] + vmovdqu16 m16{k1}, [lpfq+wq-4] + jmp .hv1_main +.hv1_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left +.hv1_loop: + movu m16, [lpfq+r10+ 0] +.hv1_main: + movu m17, [lpfq+r10+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp r10d, -66 + jl .hv1_have_right + vpbroadcastw m0, [lpfq-2] + vpternlogd m16, m0, [r13+r10+ 0], 0xe4 + vpternlogd m17, m0, [r13+r10+16], 0xe4 +.hv1_have_right: + palignr m1, m17, m16, 2 + paddw m0, m16, m1 + punpcklwd m2, m16, m1 + pmaddwd m2, m2 + punpckhwd m3, m16, m1 + pmaddwd m3, m3 + palignr m17, m16, 4 + paddw m0, m17 ; h sum + punpcklwd m1, m17, m6 + vpdpwssd m2, m1, m1 ; h sumsq + punpckhwd m17, m6 + vpdpwssd m3, m17, m17 + paddw m1, m0, [t2+r10+416*0] + paddd m16, m2, [t2+r10+416*2] + paddd m17, m3, [t2+r10+416*4] + mova [t2+r10+416*0], m0 + mova [t2+r10+416*2], m2 + mova [t2+r10+416*4], m3 + paddd m16, m8 + paddd m17, m8 + psrld m16, 4 ; (a + 8) >> 4 + psrld m17, 4 + pmulld m16, m9 ; -((a + 8) >> 4) * 9 + pmulld m17, m9 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + vpdpwssd m16, m2, m2 ; -p + punpckhwd m3, m6 + vpdpwssd m17, m3, m3 + punpcklwd m0, m6, m1 ; b + punpckhwd m1, m6, m1 + pminsd m16, m6 + pminsd m17, m6 + pmulld m16, m10 ; p * s + pmulld m17, m10 + pmaddwd m0, m11 ; b * 455 + pmaddwd m1, m11 + vpalignr m17{k2}, m16, m16, 2 + mova m16, m20 + paddusw m17, m12 + psraw m17, 4 ; min(z, 255) - 256 + vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x + pandn m16, m13, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + packssdw m16, m17 + psubd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15) + psubd m1, m13 + mova [t4+r10*1+416*2+4], m16 + psrld m16, m0, 12 + psrld m17, m1, 12 + mova [t3+r10*2+416*4+ 8], xm16 + mova [t3+r10*2+416*4+ 24], xm17 + vextracti128 [t3+r10*2+416*4+ 40], ym16, 1 + vextracti128 [t3+r10*2+416*4+ 56], ym17, 1 + vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2 + vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2 + vextracti32x4 [t3+r10*2+416*4+104], m16, 3 + vextracti32x4 [t3+r10*2+416*4+120], m17, 3 + add r10, 64 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + 
ret +.v0: ; vertical boxsums + ab (even rows) + lea r10, [wq-4] +.v0_loop: + mova m0, [t1+r10+416*0] + mova m16, [t1+r10+416*2] + mova m17, [t1+r10+416*4] + paddw m0, m0 + paddd m16, m16 + paddd m17, m17 + paddw m1, m0, [t2+r10+416*0] + paddd m2, m16, [t2+r10+416*2] + paddd m3, m17, [t2+r10+416*4] + mova [t2+r10+416*0], m0 + mova [t2+r10+416*2], m16 + mova [t2+r10+416*4], m17 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pmulld m2, m9 ; -((a + 8) >> 4) * 9 + pmulld m3, m9 + psrlw m17, m1, 1 + pavgw m17, m6 ; (b + 2) >> 2 + punpcklwd m16, m17, m6 + vpdpwssd m2, m16, m16 ; -p + punpckhwd m17, m6 + vpdpwssd m3, m17, m17 + punpcklwd m16, m6, m1 ; b + punpckhwd m17, m6, m1 + pminsd m2, m6 + pminsd m3, m6 + pmulld m2, m10 ; p * s + pmulld m3, m10 + pmaddwd m16, m11 ; b * 455 + pmaddwd m17, m11 + vpalignr m3{k2}, m2, m2, 2 + mova m2, m20 + paddusw m3, m12 + psraw m3, 4 ; min(z, 255) - 256 + vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m3 + vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m3{k3}, m2 ; x + pandn m2, m13, m3 + psrld m3, 16 + pmulld m16, m2 + pmulld m17, m3 + packssdw m2, m3 + psubd m16, m13 ; x * b * 455 + (1 << 11) + (1 << 15) + psubd m17, m13 + mova [t4+r10*1+416*0+4], m2 + psrld m16, 12 + psrld m17, 12 + mova [t3+r10*2+416*0+ 8], xm16 + mova [t3+r10*2+416*0+ 24], xm17 + vextracti128 [t3+r10*2+416*0+ 40], ym16, 1 + vextracti128 [t3+r10*2+416*0+ 56], ym17, 1 + vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2 + vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2 + vextracti32x4 [t3+r10*2+416*0+104], m16, 3 + vextracti32x4 [t3+r10*2+416*0+120], m17, 3 + add r10, 64 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) + lea r10, [wq-4] +.v1_loop: + mova m0, [t1+r10+416*0] + mova m16, [t1+r10+416*2] + mova m17, [t1+r10+416*4] + paddw m1, m0, [t2+r10+416*0] + paddd m2, m16, [t2+r10+416*2] + paddd m3, m17, [t2+r10+416*4] + mova [t2+r10+416*0], m0 + mova [t2+r10+416*2], m16 + mova [t2+r10+416*4], m17 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pmulld m2, m9 ; -((a + 8) >> 4) * 9 + pmulld m3, m9 + psrlw m17, m1, 1 + pavgw m17, m6 ; (b + 2) >> 2 + punpcklwd m16, m17, m6 + vpdpwssd m2, m16, m16 ; -p + punpckhwd m17, m6 + vpdpwssd m3, m17, m17 + punpcklwd m16, m6, m1 ; b + punpckhwd m17, m6, m1 + pminsd m2, m6 + pminsd m3, m6 + pmulld m2, m10 ; p * s + pmulld m3, m10 + pmaddwd m16, m11 ; b * 455 + pmaddwd m17, m11 + vpalignr m3{k2}, m2, m2, 2 + mova m2, m20 + paddusw m3, m12 + psraw m3, 4 ; min(z, 255) - 256 + vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m3 + vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m3{k3}, m2 ; x + pandn m2, m13, m3 + psrld m3, 16 + pmulld m16, m2 + pmulld m17, m3 + packssdw m2, m3 + psubd m16, m13 ; x * b * 455 + (1 << 11) + (1 << 15) + psubd m17, m13 + mova [t4+r10*1+416*2+4], m2 + psrld m16, 12 + psrld m17, 12 + mova [t3+r10*2+416*4+ 8], xm16 + mova [t3+r10*2+416*4+ 24], xm17 + vextracti128 [t3+r10*2+416*4+ 40], ym16, 1 + vextracti128 [t3+r10*2+416*4+ 56], ym17, 1 + vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2 + vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2 + vextracti32x4 [t3+r10*2+416*4+104], m16, 3 + vextracti32x4 [t3+r10*2+416*4+120], m17, 3 + add r10, 64 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + mova ym16, [t4+r10*1+416*0+0] + paddw ym16, [t4+r10*1+416*0+4] + paddw ym17, ym16, [t4+r10*1+416*0+2] + mova m0, [t3+r10*2+416*0+0] + paddd m0, [t3+r10*2+416*0+8] + paddd m1, m0, [t3+r10*2+416*0+4] 
+ psllw ym17, 2 ; a[-1] 444 + pslld m1, 2 ; b[-1] 444 + psubw ym17, ym16 ; a[-1] 343 + psubd m1, m0 ; b[-1] 343 + vmovdqa32 [t4+r10*1+416* 4], ym17 + vmovdqa32 [t3+r10*2+416* 8], m1 + mova ym16, [t4+r10*1+416*2+0] + paddw ym16, [t4+r10*1+416*2+4] + paddw ym17, ym16, [t4+r10*1+416*2+2] + mova m0, [t3+r10*2+416*4+0] + paddd m0, [t3+r10*2+416*4+8] + paddd m1, m0, [t3+r10*2+416*4+4] + psllw ym17, 2 ; a[ 0] 444 + pslld m1, 2 ; b[ 0] 444 + vmovdqa32 [t4+r10*1+416* 6], ym17 + vmovdqa32 [t3+r10*2+416*12], m1 + psubw ym17, ym16 ; a[ 0] 343 + psubd m1, m0 ; b[ 0] 343 + vmovdqa32 [t4+r10*1+416* 8], ym17 + vmovdqa32 [t3+r10*2+416*16], m1 + add r10, 32 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + mova m3, [t4+r10*1+416*0+0] + paddw m3, [t4+r10*1+416*0+4] + paddw m1, m3, [t4+r10*1+416*0+2] + psllw m1, 2 ; a[ 1] 444 + psubw m2, m1, m3 ; a[ 1] 343 + paddw m3, m2, [t4+r10*1+416*4] + paddw m3, [t4+r10*1+416*6] + mova [t4+r10*1+416*4], m2 + mova [t4+r10*1+416*6], m1 + mova m16, [t3+r10*2+416*0+0] + paddd m16, [t3+r10*2+416*0+8] + paddd m1, m16, [t3+r10*2+416*0+4] + pslld m1, 2 ; b[ 1] 444 + psubd m2, m1, m16 ; b[ 1] 343 + paddd m16, m2, [t3+r10*2+416* 8+ 0] + paddd m16, [t3+r10*2+416*12+ 0] + mova [t3+r10*2+416* 8+ 0], m2 + mova [t3+r10*2+416*12+ 0], m1 + mova m17, [t3+r10*2+416*0+64] + paddd m17, [t3+r10*2+416*0+72] + paddd m1, m17, [t3+r10*2+416*0+68] + pslld m1, 2 + psubd m2, m1, m17 + paddd m17, m2, [t3+r10*2+416* 8+64] + paddd m17, [t3+r10*2+416*12+64] + mova [t3+r10*2+416* 8+64], m2 + mova [t3+r10*2+416*12+64], m1 + mova m0, [dstq+r10] + punpcklwd m1, m0, m6 + punpcklwd m2, m3, m6 + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + vshufi32x4 m1, m16, m17, q2020 + vshufi32x4 m16, m17, q3131 + psubd m1, m2 ; b - a * src + (1 << 8) + psubd m16, m3 + psrad m1, 9 + psrad m16, 9 + packssdw m1, m16 + pmulhrsw m1, m7 + paddw m0, m1 + pmaxsw m0, m6 + pminsw m0, m14 + mova [dstq+r10], m0 + add r10, 64 + jl .n0_loop + add dstq, strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + mova m3, [t4+r10*1+416*2+0] + paddw m3, [t4+r10*1+416*2+4] + paddw m1, m3, [t4+r10*1+416*2+2] + psllw m1, 2 ; a[ 1] 444 + psubw m2, m1, m3 ; a[ 1] 343 + paddw m3, m2, [t4+r10*1+416*6] + paddw m3, [t4+r10*1+416*8] + mova [t4+r10*1+416*6], m1 + mova [t4+r10*1+416*8], m2 + mova m16, [t3+r10*2+416*4+0] + paddd m16, [t3+r10*2+416*4+8] + paddd m1, m16, [t3+r10*2+416*4+4] + pslld m1, 2 ; b[ 1] 444 + psubd m2, m1, m16 ; b[ 1] 343 + paddd m16, m2, [t3+r10*2+416*12+ 0] + paddd m16, [t3+r10*2+416*16+ 0] + mova [t3+r10*2+416*12+ 0], m1 + mova [t3+r10*2+416*16+ 0], m2 + mova m17, [t3+r10*2+416*4+64] + paddd m17, [t3+r10*2+416*4+72] + paddd m1, m17, [t3+r10*2+416*4+68] + pslld m1, 2 + psubd m2, m1, m17 + paddd m17, m2, [t3+r10*2+416*12+64] + paddd m17, [t3+r10*2+416*16+64] + mova [t3+r10*2+416*12+64], m1 + mova [t3+r10*2+416*16+64], m2 + mova m0, [dstq+r10] + punpcklwd m1, m0, m6 + punpcklwd m2, m3, m6 + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + vshufi32x4 m1, m16, m17, q2020 + vshufi32x4 m16, m17, q3131 + psubd m1, m2 ; b - a * src + (1 << 8) + psubd m16, m3 + psrad m1, 9 + psrad m16, 9 + packssdw m1, m16 + pmulhrsw m1, m7 + paddw m0, m1 + pmaxsw m0, m6 + pminsw m0, m14 + mova [dstq+r10], m0 + add r10, 64 + jl .n1_loop + add dstq, strideq + ret + +cglobal sgr_filter_mix_16bpc, 4, 14, 23, 416*66+8, dst, stride, left, lpf, \ + w, h, edge, params + movifnidn wd, 
wm + mov paramsq, r6mp + lea r13, [r_ext_mask+72] + mov edged, r7m + movifnidn hd, hm + vpbroadcastd m7, [paramsq+8] ; w0 w1 + pxor m6, m6 + vpbroadcastd m8, [base+pd_8] + add wd, wd + vpbroadcastd m9, [base+pd_m9] + add lpfq, wq + vpbroadcastd m10, [base+pd_m25] + add dstq, wq + vpsubd m11, m6, [paramsq+0] {1to16} ; -s0 + lea t3, [rsp+wq*2+416*24+8] + vpsubd m12, m6, [paramsq+4] {1to16} ; -s1 + lea t4, [rsp+wq+416*52+8] + vpbroadcastd m13, [base+pw_164_455] + lea t1, [rsp+wq+12] + vpbroadcastd m14, [base+pw_61448] + neg wq + vpbroadcastd m15, [base+pd_m34816] + psllw m7, 2 + vpbroadcastd m22, [base+pd_2147483648] + mov r10d, 0xfffffff8 + mova m18, [sgr_x_by_x+64*0] + kmovd k1, r10d + mova m19, [sgr_x_by_x+64*1] + mov r10, 0x3333333333333333 + mova m20, [sgr_x_by_x+64*2] + kmovq k2, r10 + mova m21, [sgr_x_by_x+64*3] + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t2, t1 + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx512icl).top_fixup + add t1, 416*12 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, strideq + mov [rsp], r10 ; below + call .hv0 +.main: + dec hd + jz .height1 + add lpfq, strideq + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, strideq + call .hv0 + test hd, hd + jz .odd_height + add lpfq, strideq + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .hv0_bottom + add lpfq, strideq + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + lea r10, [wq-4] + lea t2, [t1+416*12] +.top_fixup_loop: + mova m0, [t1+r10+416* 0] + mova m1, [t1+r10+416* 2] + mova m2, [t1+r10+416* 4] + paddw m0, m0 + mova m3, [t1+r10+416* 6] + paddd m1, m1 + mova m4, [t1+r10+416* 8] + paddd m2, m2 + mova m5, [t1+r10+416*10] + mova [t2+r10+416* 0], m0 + mova [t2+r10+416* 2], m1 + mova [t2+r10+416* 4], m2 + mova [t2+r10+416* 6], m3 + mova [t2+r10+416* 8], m4 + mova [t2+r10+416*10], m5 + add r10, 64 + jl .top_fixup_loop + call .v0 + jmp .main +.h: ; horizontal boxsum + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movq xm16, [leftq+2] + vmovdqu16 m16{k1}, [lpfq+wq-6] + add leftq, 8 + jmp .h_main +.h_extend_left: + vpbroadcastw xm16, [lpfq+wq] + vmovdqu16 m16{k1}, [lpfq+wq-6] + jmp .h_main +.h_top: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m16, [lpfq+r10- 2] +.h_main: + movu m17, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -68 + jl .h_have_right + vpbroadcastw m0, [lpfq-2] + vpternlogd m16, m0, [r13+r10+ 0], 0xe4 + vpternlogd m17, m0, [r13+r10+16], 0xe4 +.h_have_right: + palignr m3, m17, m16, 2 + palignr m0, m17, m16, 4 + paddw m1, m3, m0 + punpcklwd m2, m3, m0 + pmaddwd m2, m2 + punpckhwd m3, m0 + pmaddwd m3, m3 + palignr m0, m17, m16, 6 + paddw m1, m0 ; sum3 + punpcklwd m4, m0, m6 + vpdpwssd m2, m4, m4 ; sumsq3 + punpckhwd m0, m6 + vpdpwssd m3, m0, m0 + shufpd m4, m16, m17, 0x55 + punpcklwd m17, m4, m16 + paddw m0, m16, m4 + punpckhwd m4, m16 + mova [t1+r10+416* 6], m1 + mova [t1+r10+416* 8], m2 + mova [t1+r10+416*10], m3 + paddw m1, m0 ; sum5 + vpdpwssd m2, m17, m17 ; sumsq5 + vpdpwssd m3, m4, 
m4 + mova [t1+r10+416* 0], m1 + mova [t1+r10+416* 2], m2 + mova [t1+r10+416* 4], m3 + add r10, 64 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows) + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movq xm16, [leftq+2] + vmovdqu16 m16{k1}, [lpfq+wq-6] + add leftq, 8 + jmp .hv0_main +.hv0_extend_left: + vpbroadcastw xm16, [lpfq+wq] + vmovdqu16 m16{k1}, [lpfq+wq-6] + jmp .hv0_main +.hv0_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left +.hv0_loop: + movu m16, [lpfq+r10- 2] +.hv0_main: + movu m17, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp r10d, -68 + jl .hv0_have_right + vpbroadcastw m0, [lpfq-2] + vpternlogd m16, m0, [r13+r10+ 0], 0xe4 + vpternlogd m17, m0, [r13+r10+16], 0xe4 +.hv0_have_right: + palignr m3, m17, m16, 2 + palignr m0, m17, m16, 4 + paddw m1, m3, m0 + punpcklwd m2, m3, m0 + pmaddwd m2, m2 + punpckhwd m3, m0 + pmaddwd m3, m3 + palignr m0, m17, m16, 6 + paddw m1, m0 ; h sum3 + punpcklwd m4, m0, m6 + vpdpwssd m2, m4, m4 ; h sumsq3 + punpckhwd m0, m6 + vpdpwssd m3, m0, m0 + shufpd m17, m16, m17, 0x55 + paddw m4, m1, [t1+r10+416* 6] + paddd m5, m2, [t1+r10+416* 8] + mova [t1+r10+416* 6], m1 + mova [t1+r10+416* 8], m2 + paddw m1, m16 + paddw m1, m17 ; h sum5 + punpcklwd m0, m17, m16 + vpdpwssd m2, m0, m0 ; h sumsq5 + paddd m0, m3, [t1+r10+416*10] + mova [t1+r10+416*10], m3 + punpckhwd m17, m16 + vpdpwssd m3, m17, m17 + mova [t3+r10*2+416*8+ 8], m1 ; we need a clean copy of the last row + mova [t3+r10*2+416*0+ 8], m2 ; in case height is odd + mova [t3+r10*2+416*0+72], m3 + paddw m1, [t1+r10+416* 0] + paddd m2, [t1+r10+416* 2] + paddd m3, [t1+r10+416* 4] + mova [t1+r10+416* 0], m1 + mova [t1+r10+416* 2], m2 + mova [t1+r10+416* 4], m3 + paddw m17, m4, [t2+r10+416* 6] + paddd m2, m5, [t2+r10+416* 8] + paddd m3, m0, [t2+r10+416*10] + mova [t2+r10+416* 6], m4 + mova [t2+r10+416* 8], m5 + mova [t2+r10+416*10], m0 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 + pmulld m2, m9 ; -((a3 + 8) >> 4) * 9 + pmulld m3, m9 + psrlw m5, m17, 1 + pavgw m5, m6 ; (b3 + 2) >> 2 + punpcklwd m4, m5, m6 + vpdpwssd m2, m4, m4 ; -p3 + punpckhwd m5, m6 + vpdpwssd m3, m5, m5 + punpcklwd m16, m6, m17 ; b3 + punpckhwd m17, m6, m17 + pminsd m2, m6 + pminsd m3, m6 + pmulld m2, m12 ; p3 * s1 + pmulld m3, m12 + pmaddwd m16, m13 ; b3 * 455 + pmaddwd m17, m13 + vpalignr m3{k2}, m2, m2, 2 + mova m2, m20 + paddusw m3, m14 + psraw m3, 4 ; min(z3, 255) - 256 + vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m3 + vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m3{k3}, m2 ; x3 + pandn m2, m15, m3 + psrld m3, 16 + pmulld m16, m2 + pmulld m17, m3 + packssdw m2, m3 + mova [t4+r10*1+416*2+4], m2 + psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + psubd m17, m15 + psrld m16, 12 + psrld m17, 12 + mova [t3+r10*2+416*4+ 8], xm16 + mova [t3+r10*2+416*4+ 24], xm17 + vextracti128 [t3+r10*2+416*4+ 40], ym16, 1 + vextracti128 [t3+r10*2+416*4+ 56], ym17, 1 + vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2 + vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2 + vextracti32x4 [t3+r10*2+416*4+104], m16, 3 + vextracti32x4 [t3+r10*2+416*4+120], m17, 3 + add r10, 64 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movq xm16, [leftq+2] + vmovdqu16 m16{k1}, [lpfq+wq-6] + add leftq, 8 + jmp .hv1_main +.hv1_extend_left: + vpbroadcastw xm16, [lpfq+wq] 
+ vmovdqu16 m16{k1}, [lpfq+wq-6] + jmp .hv1_main +.hv1_bottom: + lea r10, [wq-4] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left +.hv1_loop: + movu m16, [lpfq+r10- 2] +.hv1_main: + movu m17, [lpfq+r10+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp r10d, -68 + jl .hv1_have_right + vpbroadcastw m0, [lpfq-2] + vpternlogd m16, m0, [r13+r10+ 0], 0xe4 + vpternlogd m17, m0, [r13+r10+16], 0xe4 +.hv1_have_right: + palignr m1, m17, m16, 2 + palignr m3, m17, m16, 4 + paddw m2, m1, m3 + punpcklwd m0, m1, m3 + pmaddwd m0, m0 + punpckhwd m1, m3 + pmaddwd m1, m1 + palignr m3, m17, m16, 6 + paddw m2, m3 ; h sum3 + punpcklwd m5, m3, m6 + vpdpwssd m0, m5, m5 ; h sumsq3 + punpckhwd m3, m6 + vpdpwssd m1, m3, m3 + shufpd m3, m16, m17, 0x55 + punpcklwd m5, m16, m3 + paddw m4, m16, m3 + punpckhwd m16, m3 + paddw m17, m2, [t2+r10+416* 6] + mova [t2+r10+416* 6], m2 + paddw m4, m2 ; h sum5 + paddd m2, m0, [t2+r10+416* 8] + paddd m3, m1, [t2+r10+416*10] + mova [t2+r10+416* 8], m0 + mova [t2+r10+416*10], m1 + vpdpwssd m0, m5, m5 ; h sumsq5 + vpdpwssd m1, m16, m16 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 + pmulld m2, m9 ; -((a3 + 8) >> 4) * 9 + pmulld m3, m9 + psrlw m16, m17, 1 + pavgw m16, m6 ; (b3 + 2) >> 2 + punpcklwd m5, m16, m6 + vpdpwssd m2, m5, m5 ; -p3 + punpckhwd m16, m6 + vpdpwssd m3, m16, m16 + punpcklwd m16, m6, m17 ; b3 + punpckhwd m17, m6, m17 + pminsd m2, m6 + pminsd m3, m6 + pmulld m2, m12 ; p3 * s1 + pmulld m3, m12 + pmaddwd m16, m13 ; b3 * 455 + pmaddwd m17, m13 + vpalignr m3{k2}, m2, m2, 2 + mova m2, m20 + paddusw m3, m14 + psraw m3, 4 ; min(z3, 255) - 256 + vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m3 + vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m3{k3}, m2 ; x3 + pandn m2, m15, m3 + psrld m3, 16 + pmulld m16, m2 + pmulld m17, m3 + packssdw m2, m3 + mova [t4+r10*1+416*4+4], m2 + psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + psubd m17, m15 + psrld m16, 12 + psrld m17, 12 + paddw m5, m4, [t2+r10+416*0] + paddd m2, m0, [t2+r10+416*2] + paddd m3, m1, [t2+r10+416*4] + paddw m5, [t1+r10+416*0] + paddd m2, [t1+r10+416*2] + paddd m3, [t1+r10+416*4] + mova [t2+r10+416*0], m4 + mova [t2+r10+416*2], m0 + mova [t2+r10+416*4], m1 + mova [t3+r10*2+416*8+ 8], xm16 + mova [t3+r10*2+416*8+ 24], xm17 + vextracti128 [t3+r10*2+416*8+ 40], ym16, 1 + vextracti128 [t3+r10*2+416*8+ 56], ym17, 1 + vextracti32x4 [t3+r10*2+416*8+ 72], m16, 2 + vextracti32x4 [t3+r10*2+416*8+ 88], m17, 2 + vextracti32x4 [t3+r10*2+416*8+104], m16, 3 + vextracti32x4 [t3+r10*2+416*8+120], m17, 3 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a5 + 8) >> 4 + psrld m3, 4 + pmulld m2, m10 ; -((a5 + 8) >> 4) * 25 + pmulld m3, m10 + psrlw m17, m5, 1 + pavgw m17, m6 ; (b5 + 2) >> 2 + punpcklwd m16, m17, m6 + vpdpwssd m2, m16, m16 ; -p5 + punpckhwd m17, m6 + vpdpwssd m3, m17, m17 + punpcklwd m16, m5, m6 ; b5 + punpckhwd m17, m5, m6 + pmulld m2, m11 ; p5 * s0 + pmulld m3, m11 + pmaddwd m16, m13 ; b5 * 164 + pmaddwd m17, m13 + vpalignr m3{k2}, m2, m2, 2 + mova m2, m20 + pmaxsw m3, m6 + paddusw m3, m14 + psraw m3, 4 ; min(z5, 255) - 256 + vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m3 + vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m3{k3}, m2 ; x5 + pandn m2, m15, m3 + psrld m3, 16 + pmulld m16, m2 + pmulld m17, m3 + packssdw m2, m3 + mova [t4+r10*1+416*0+4], m2 + psubd m16, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + psubd m17, m15 + psrld m16, 12 + psrld m17, 12 + mova [t3+r10*2+416*0+ 8], xm16 + mova [t3+r10*2+416*0+ 24], xm17 + 
vextracti128 [t3+r10*2+416*0+ 40], ym16, 1 + vextracti128 [t3+r10*2+416*0+ 56], ym17, 1 + vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2 + vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2 + vextracti32x4 [t3+r10*2+416*0+104], m16, 3 + vextracti32x4 [t3+r10*2+416*0+120], m17, 3 + add r10, 64 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab3 (even rows) + lea r10, [wq-4] +.v0_loop: + mova m16, [t1+r10+416* 6] + mova m2, [t1+r10+416* 8] + mova m3, [t1+r10+416*10] + paddw m16, m16 + paddd m2, m2 + paddd m3, m3 + paddw m17, m16, [t2+r10+416* 6] + paddd m4, m2, [t2+r10+416* 8] + paddd m5, m3, [t2+r10+416*10] + mova [t2+r10+416* 6], m16 + mova [t2+r10+416* 8], m2 + mova [t2+r10+416*10], m3 + paddd m4, m8 + paddd m5, m8 + psrld m4, 4 ; (a3 + 8) >> 4 + psrld m5, 4 + pmulld m4, m9 ; -((a3 + 8) >> 4) * 9 + pmulld m5, m9 + psrlw m3, m17, 1 + pavgw m3, m6 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m6 + vpdpwssd m4, m2, m2 ; -p3 + punpckhwd m3, m6 + vpdpwssd m5, m3, m3 + punpcklwd m16, m6, m17 ; b3 + punpckhwd m17, m6, m17 + pminsd m4, m6 + pminsd m5, m6 + pmulld m4, m12 ; p3 * s1 + pmulld m5, m12 + pmaddwd m16, m13 ; b3 * 455 + pmaddwd m17, m13 + vpalignr m5{k2}, m4, m4, 2 + mova m4, m20 + paddusw m5, m14 + psraw m5, 4 ; min(z3, 255) - 256 + vpermt2b m4, m5, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m5 + vpermi2b m5, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m5{k3}, m4 ; x3 + pandn m4, m15, m5 + psrld m5, 16 + pmulld m16, m4 + pmulld m17, m5 + packssdw m4, m5 + mova [t4+r10*1+416*2+4], m4 + psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + psubd m17, m15 + psrld m16, 12 + psrld m17, 12 + mova m3, [t1+r10+416*0] + mova m4, [t1+r10+416*2] + mova m5, [t1+r10+416*4] + mova [t3+r10*2+416*8+ 8], m3 + mova [t3+r10*2+416*0+ 8], m4 + mova [t3+r10*2+416*0+72], m5 + paddw m3, m3 ; cc5 + paddd m4, m4 + paddd m5, m5 + mova [t1+r10+416*0], m3 + mova [t1+r10+416*2], m4 + mova [t1+r10+416*4], m5 + mova [t3+r10*2+416*4+ 8], xm16 + mova [t3+r10*2+416*4+ 24], xm17 + vextracti128 [t3+r10*2+416*4+ 40], ym16, 1 + vextracti128 [t3+r10*2+416*4+ 56], ym17, 1 + vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2 + vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2 + vextracti32x4 [t3+r10*2+416*4+104], m16, 3 + vextracti32x4 [t3+r10*2+416*4+120], m17, 3 + add r10, 64 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) + lea r10, [wq-4] +.v1_loop: + mova m16, [t1+r10+416* 6] + mova m2, [t1+r10+416* 8] + mova m3, [t1+r10+416*10] + paddw m17, m16, [t2+r10+416* 6] + paddd m4, m2, [t2+r10+416* 8] + paddd m5, m3, [t2+r10+416*10] + mova [t2+r10+416* 6], m16 + mova [t2+r10+416* 8], m2 + mova [t2+r10+416*10], m3 + paddd m4, m8 + paddd m5, m8 + psrld m4, 4 ; (a3 + 8) >> 4 + psrld m5, 4 + pmulld m4, m9 ; -((a3 + 8) >> 4) * 9 + pmulld m5, m9 + psrlw m3, m17, 1 + pavgw m3, m6 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m6 + vpdpwssd m4, m2, m2 ; -p3 + punpckhwd m3, m6 + vpdpwssd m5, m3, m3 + punpcklwd m16, m6, m17 ; b3 + punpckhwd m17, m6, m17 + pminsd m4, m6 + pminsd m5, m6 + pmulld m4, m12 ; p3 * s1 + pmulld m5, m12 + pmaddwd m16, m13 ; b3 * 455 + pmaddwd m17, m13 + vpalignr m5{k2}, m4, m4, 2 + mova m4, m20 + paddusw m5, m14 + psraw m5, 4 ; min(z3, 255) - 256 + vpermt2b m4, m5, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m5 + vpermi2b m5, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m5{k3}, m4 ; x3 + pandn m4, m15, m5 + psrld m5, 16 + pmulld m16, m4 + pmulld m17, m5 + packssdw m4, m5 + mova [t4+r10*1+416*4+4], m4 + psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + psubd m17, m15 + psrld m16, 12 + psrld m17, 12 + mova m0, 
[t3+r10*2+416*8+ 8] + mova m4, [t3+r10*2+416*0+ 8] + mova m5, [t3+r10*2+416*0+72] + paddw m1, m0, [t2+r10+416*0] + paddd m2, m4, [t2+r10+416*2] + paddd m3, m5, [t2+r10+416*4] + paddw m1, [t1+r10+416*0] + paddd m2, [t1+r10+416*2] + paddd m3, [t1+r10+416*4] + mova [t2+r10+416*0], m0 + mova [t2+r10+416*2], m4 + mova [t2+r10+416*4], m5 + mova [t3+r10*2+416*8+ 8], xm16 + mova [t3+r10*2+416*8+ 24], xm17 + vextracti128 [t3+r10*2+416*8+ 40], ym16, 1 + vextracti128 [t3+r10*2+416*8+ 56], ym17, 1 + vextracti32x4 [t3+r10*2+416*8+ 72], m16, 2 + vextracti32x4 [t3+r10*2+416*8+ 88], m17, 2 + vextracti32x4 [t3+r10*2+416*8+104], m16, 3 + vextracti32x4 [t3+r10*2+416*8+120], m17, 3 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a5 + 8) >> 4 + psrld m3, 4 + pmulld m2, m10 ; -((a5 + 8) >> 4) * 25 + pmulld m3, m10 + psrlw m5, m1, 1 + pavgw m5, m6 ; (b5 + 2) >> 2 + punpcklwd m4, m5, m6 + vpdpwssd m2, m4, m4 ; -p5 + punpckhwd m5, m6 + vpdpwssd m3, m5, m5 + punpcklwd m16, m1, m6 ; b5 + punpckhwd m17, m1, m6 + pmulld m2, m11 ; p5 * s0 + pmulld m3, m11 + pmaddwd m16, m13 ; b5 * 164 + pmaddwd m17, m13 + vpalignr m3{k2}, m2, m2, 2 + mova m2, m20 + pmaxsw m3, m6 + paddusw m3, m14 + psraw m3, 4 ; min(z5, 255) - 256 + vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m3 + vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m3{k3}, m2 ; x5 + pandn m2, m15, m3 + psrld m3, 16 + pmulld m16, m2 + pmulld m17, m3 + packssdw m2, m3 + mova [t4+r10*1+416*0+4], m2 + psubd m16, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + psubd m17, m15 + psrld m16, 12 + psrld m17, 12 + mova [t3+r10*2+416*0+ 8], xm16 + mova [t3+r10*2+416*0+ 24], xm17 + vextracti128 [t3+r10*2+416*0+ 40], ym16, 1 + vextracti128 [t3+r10*2+416*0+ 56], ym17, 1 + vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2 + vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2 + vextracti32x4 [t3+r10*2+416*0+104], m16, 3 + vextracti32x4 [t3+r10*2+416*0+120], m17, 3 + add r10, 64 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + movu ym0, [t4+r10*1+416*0+2] + paddw ym2, ym0, [t4+r10*1+416*0+0] + paddw ym2, [t4+r10*1+416*0+4] + movu m1, [t3+r10*2+416*0+4] + paddd m3, m1, [t3+r10*2+416*0+0] + paddd m3, [t3+r10*2+416*0+8] + paddw ym0, ym2 + paddd m1, m3 + psllw ym2, 2 + pslld m3, 2 + paddw ym0, ym2 ; a5 565 + paddd m1, m3 ; b5 565 + mova [t4+r10*1+416* 6], ym0 + mova [t3+r10*2+416*12], m1 + mova ym0, [t4+r10*1+416*2+0] + paddw ym0, [t4+r10*1+416*2+4] + paddw ym2, ym0, [t4+r10*1+416*2+2] + mova m1, [t3+r10*2+416*4+0] + paddd m1, [t3+r10*2+416*4+8] + paddd m3, m1, [t3+r10*2+416*4+4] + psllw ym2, 2 ; a3[-1] 444 + pslld m3, 2 ; b3[-1] 444 + psubw ym2, ym0 ; a3[-1] 343 + psubd m3, m1 ; b3[-1] 343 + mova [t4+r10*1+416* 8], ym2 + mova [t3+r10*2+416*16], m3 + mova ym0, [t4+r10*1+416*4+0] + paddw ym0, [t4+r10*1+416*4+4] + paddw ym2, ym0, [t4+r10*1+416*4+2] + mova m1, [t3+r10*2+416*8+0] + paddd m1, [t3+r10*2+416*8+8] + paddd m3, m1, [t3+r10*2+416*8+4] + psllw ym2, 2 ; a3[ 0] 444 + pslld m3, 2 ; b3[ 0] 444 + mova [t4+r10*1+416*10], ym2 + mova [t3+r10*2+416*20], m3 + psubw ym2, ym0 ; a3[ 0] 343 + psubd m3, m1 ; b3[ 0] 343 + mova [t4+r10*1+416*12], ym2 + mova [t3+r10*2+416*24], m3 + add r10, 32 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + movu ym2, [t4+r10*1+2] + paddw ym0, ym2, [t4+r10*1+0] + paddw ym0, [t4+r10*1+4] + paddw ym2, ym0 + psllw ym0, 2 + paddw ym0, ym2 ; a5 + movu m1, [t3+r10*2+4] + paddd m4, m1, [t3+r10*2+0] + paddd m4, [t3+r10*2+8] + paddd m1, m4 + 
pslld m4, 2 + paddd m4, m1 ; b5 + paddw ym2, ym0, [t4+r10*1+416* 6] + mova [t4+r10*1+416* 6], ym0 + paddd m0, m4, [t3+r10*2+416*12] + mova [t3+r10*2+416*12], m4 + mova ym3, [t4+r10*1+416*2+0] + paddw ym3, [t4+r10*1+416*2+4] + paddw ym5, ym3, [t4+r10*1+416*2+2] + psllw ym5, 2 ; a3[ 1] 444 + psubw ym4, ym5, ym3 ; a3[ 1] 343 + paddw ym3, ym4, [t4+r10*1+416* 8] + paddw ym3, [t4+r10*1+416*10] + mova [t4+r10*1+416* 8], ym4 + mova [t4+r10*1+416*10], ym5 + mova m1, [t3+r10*2+416*4+0] + paddd m1, [t3+r10*2+416*4+8] + paddd m5, m1, [t3+r10*2+416*4+4] + pslld m5, 2 ; b3[ 1] 444 + psubd m4, m5, m1 ; b3[ 1] 343 + paddd m1, m4, [t3+r10*2+416*16] + paddd m1, [t3+r10*2+416*20] + mova [t3+r10*2+416*16], m4 + mova [t3+r10*2+416*20], m5 + pmovzxwd m4, [dstq+r10] + pmovzxwd m2, ym2 ; a5 + pmovzxwd m3, ym3 ; a3 + pmaddwd m2, m4 ; a5 * src + pmaddwd m3, m4 ; a3 * src + vpshldd m4, m22, 13 + psubd m0, m2 ; b5 - a5 * src + (1 << 8) + psubd m1, m3 ; b3 - a3 * src + (1 << 8) + psrld m0, 9 + pslld m1, 7 + vpblendmb m0{k2}, m1, m0 + vpdpwssd m4, m0, m7 + psrad m4, 7 + pmaxsd m4, m6 + vpmovusdw ym16, m4 ; clip + psrlw ym16, 6 + mova [dstq+r10], ym16 + add r10, 32 + jl .n0_loop + add dstq, strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + mova ym3, [t4+r10*1+416*4+0] + paddw ym3, [t4+r10*1+416*4+4] + paddw ym5, ym3, [t4+r10*1+416*4+2] + psllw ym5, 2 ; a3[ 1] 444 + psubw ym4, ym5, ym3 ; a3[ 1] 343 + paddw ym3, ym4, [t4+r10*1+416*12] + paddw ym3, [t4+r10*1+416*10] + mova [t4+r10*1+416*10], ym5 + mova [t4+r10*1+416*12], ym4 + mova m0, [t3+r10*2+416*8+0] + paddd m0, [t3+r10*2+416*8+8] + paddd m5, m0, [t3+r10*2+416*8+4] + pslld m5, 2 ; b3[ 1] 444 + psubd m4, m5, m0 ; b3[ 1] 343 + paddd m0, m4, [t3+r10*2+416*24] + paddd m0, [t3+r10*2+416*20] + mova [t3+r10*2+416*20], m5 + mova [t3+r10*2+416*24], m4 + pmovzxwd m4, [dstq+r10] + pmovzxwd m2, [t4+r10*1+416* 6] + pmovzxwd m3, ym3 + mova m1, [t3+r10*2+416*12] + pmaddwd m2, m4 ; a5 * src + pmaddwd m3, m4 ; a3 * src + vpshldd m4, m22, 13 + psubd m1, m2 ; b5 - a5 * src + (1 << 8) + psubd m0, m3 ; b3 - a3 * src + (1 << 8) + pslld m0, 7 + vpalignr m0{k2}, m1, m1, 1 + vpdpwssd m4, m0, m7 + psrad m4, 7 + pmaxsd m4, m6 + vpmovusdw ym16, m4 ; clip + psrlw ym16, 6 + mova [dstq+r10], ym16 + add r10, 32 + jl .n1_loop + add dstq, strideq + ret + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/looprestoration16_sse.asm b/third_party/dav1d/src/x86/looprestoration16_sse.asm new file mode 100644 index 0000000000..872e502982 --- /dev/null +++ b/third_party/dav1d/src/x86/looprestoration16_sse.asm @@ -0,0 +1,3723 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA + +wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11 +wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11 +wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15 +wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 +wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +wiener_lshuf5: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 +wiener_lshuf7: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 +sgr_lshuf3: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 +sgr_lshuf5: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 +pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + +pb_m14_m13: times 8 db -14,-13 +pb_m10_m9: times 8 db -10, -9 +pb_m6_m5: times 8 db -6, -5 +pb_m2_m1: times 8 db -2, -1 +pb_2_3: times 8 db 2, 3 +pb_6_7: times 8 db 6, 7 +pw_256: times 8 dw 256 +pw_1023: times 8 dw 1023 +pd_8: times 4 dd 8 +pd_4096: times 4 dd 4096 +pd_34816: times 4 dd 34816 +pd_m262128: times 4 dd -262128 +pd_0xffff: times 4 dd 0xffff +pd_0xf00800a4: times 4 dd 0xf00800a4 +pd_0xf00801c7: times 4 dd 0xf00801c7 +pd_0xfffffff0: times 4 dd 0xfffffff0 + +wiener_shifts: dw 4, 4, 2048, 2048, 1, 1, 8192, 8192 +wiener_round: dd 1049600, 1048832 + +cextern sgr_x_by_x + +SECTION .text + +%macro movif64 2 ; dst, src + %if ARCH_X86_64 + mov %1, %2 + %endif +%endmacro + +%macro movif32 2 ; dst, src + %if ARCH_X86_32 + mov %1, %2 + %endif +%endmacro + +INIT_XMM ssse3 +%if ARCH_X86_32 +DECLARE_REG_TMP 5, 6 + %if STACK_ALIGNMENT < 16 + %assign extra_stack 13*16 + %else + %assign extra_stack 12*16 + %endif +cglobal wiener_filter7_16bpc, 4, 7, 8, -384*12-16-extra_stack, \ + dst, stride, left, lpf, w, flt + %if STACK_ALIGNMENT < 16 + %define lpfm dword [esp+calloff+16*12+ 0] + %define wm dword [esp+calloff+16*12+ 4] + %define hd dword [esp+calloff+16*12+ 8] + %define edgeb byte [esp+calloff+16*12+12] + %define edged dword [esp+calloff+16*12+12] + %else + %define hd dword r5m + %define edgeb byte r7m + %endif + %define PICmem dword [esp+calloff+4*0] + %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers + %define t1m dword [esp+calloff+4*2] + %define t2m dword [esp+calloff+4*3] + %define t3m dword [esp+calloff+4*4] + %define t4m dword [esp+calloff+4*5] + %define t5m dword [esp+calloff+4*6] + %define t6m dword [esp+calloff+4*7] + %define t2 t2m + %define t3 t3m + %define t4 t4m + %define t5 t5m + %define t6 t6m + %define m8 [esp+calloff+16*2] + %define m9 [esp+calloff+16*3] + %define m10 [esp+calloff+16*4] + %define m11 [esp+calloff+16*5] + %define m12 [esp+calloff+16*6] + %define m13 [esp+calloff+16*7] + %define m14 [esp+calloff+16*8] + %define m15 [esp+calloff+16*9] + %define r10 r4 + %define base t0-wiener_shifts + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov wd, [rstk+stack_offset+20] + mov wm, wd + mov r5, [rstk+stack_offset+24] + mov hd, r5 + mov r5, [rstk+stack_offset+32] + mov edged, r5 ; edge 
+ %endif +%else +DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers +cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \ + w, h, edge, flt + %define base +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + movifnidn wd, wm +%endif +%if ARCH_X86_64 + mov fltq, r6mp + movifnidn hd, hm + mov edged, r7m + mov t3d, r8m ; pixel_max + movq m13, [fltq] + movq m15, [fltq+16] +%else + %if STACK_ALIGNMENT < 16 + mov t0, [rstk+stack_offset+28] + mov t1, [rstk+stack_offset+36] ; pixel_max + movq m1, [t0] ; fx + movq m3, [t0+16] ; fy + LEA t0, wiener_shifts + %else + mov fltq, r6m + movq m1, [fltq] + movq m3, [fltq+16] + LEA t0, wiener_shifts + mov t1, r8m ; pixel_max + %endif + mov PICmem, t0 +%endif + mova m6, [base+wiener_shufA] + mova m7, [base+wiener_shufB] +%if ARCH_X86_64 + lea t4, [wiener_shifts] + add wd, wd + pshufd m12, m13, q0000 ; x0 x1 + pshufd m13, m13, q1111 ; x2 x3 + pshufd m14, m15, q0000 ; y0 y1 + pshufd m15, m15, q1111 ; y2 y3 + mova m8, [wiener_shufC] + mova m9, [wiener_shufD] + add lpfq, wq + lea t1, [rsp+wq+16] + add dstq, wq + neg wq + shr t3d, 11 + %define base t4-wiener_shifts + movd m10, [base+wiener_round+t3*4] + movq m11, [base+wiener_shifts+t3*8] + pshufd m10, m10, q0000 + pshufd m0, m11, q0000 + pshufd m11, m11, q1111 + pmullw m12, m0 ; upshift filter coefs to make the + pmullw m13, m0 ; horizontal downshift constant + DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w + %define lpfm [rsp] + %define base + %define wiener_lshuf7_mem [wiener_lshuf7] + %define pd_m262128_mem [pd_m262128] +%else + add wd, wd + mova m4, [base+wiener_shufC] + mova m5, [base+wiener_shufD] + pshufd m0, m1, q0000 + pshufd m1, m1, q1111 + pshufd m2, m3, q0000 + pshufd m3, m3, q1111 + mova m8, m4 + mova m9, m5 + mova m14, m2 + mova m15, m3 + shr t1, 11 + add lpfq, wq + mova m3, [base+pd_m262128] + movd m4, [base+wiener_round+t1*4] + movq m5, [base+wiener_shifts+t1*8] + lea t1, [esp+extra_stack+wq+16] + add dstq, wq + neg wq + pshufd m4, m4, q0000 + pshufd m2, m5, q0000 + pshufd m5, m5, q1111 + mov wm, wq + pmullw m0, m2 + pmullw m1, m2 + mova m2, [base+wiener_lshuf7] + %define pd_m262128_mem [esp+calloff+16*10] + mova pd_m262128_mem, m3 + mova m10, m4 + mova m11, m5 + mova m12, m0 + mova m13, m1 + %define wiener_lshuf7_mem [esp+calloff+16*11] + mova wiener_lshuf7_mem, m2 +%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t6, t1 + mov t5, t1 + add t1, 384*2 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + mov t4, t1 + add t1, 384*2 + add r10, strideq + mov lpfm, r10 ; below + call .h + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v3 +.main: + lea t0, [t1+384*2] +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v3 + mov lpfq, lpfm + call .hv_bottom + add lpfq, strideq + call .hv_bottom +.v1: + call .v + RET +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov lpfm, r10 + call .h + mov t6, t1 + mov t5, t1 + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v3 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v3 + add t0, 384*8 + call .hv + dec hd + jnz .main +.v3: + call .v + movif32 wq, wm +.v2: + call .v + movif32 wq, wm + jmp .v1 +.extend_right: +%assign 
stack_offset stack_offset+8 +%assign calloff 8 + movif32 t0, PICmem + pxor m0, m0 + movd m1, wd + mova m2, [base+pb_0to15] + pshufb m1, m0 + mova m0, [base+pb_6_7] + psubb m0, m1 + pminub m0, m2 + pshufb m3, m0 + mova m0, [base+pb_m2_m1] + psubb m0, m1 + pminub m0, m2 + pshufb m4, m0 + mova m0, [base+pb_m10_m9] + psubb m0, m1 + pminub m0, m2 + pshufb m5, m0 + movif32 t0, t0m + ret +%assign stack_offset stack_offset-4 +%assign calloff 4 +.h: + movif64 wq, r4 + movif32 wq, wm + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movq m3, [leftq] + movhps m3, [lpfq+wq] + add leftq, 8 + jmp .h_main +.h_extend_left: + mova m3, [lpfq+wq] ; avoid accessing memory located + pshufb m3, wiener_lshuf7_mem ; before the start of the buffer + jmp .h_main +.h_top: + movif64 wq, r4 + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m3, [lpfq+wq-8] +.h_main: + mova m4, [lpfq+wq+0] + movu m5, [lpfq+wq+8] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp wd, -20 + jl .h_have_right + call .extend_right +.h_have_right: + pshufb m0, m3, m6 + pshufb m1, m4, m7 + paddw m0, m1 + pshufb m3, m8 + pmaddwd m0, m12 + pshufb m1, m4, m9 + paddw m3, m1 + pshufb m1, m4, m6 + pmaddwd m3, m13 + pshufb m2, m5, m7 + paddw m1, m2 + mova m2, pd_m262128_mem ; (1 << 4) - (1 << 18) + pshufb m4, m8 + pmaddwd m1, m12 + pshufb m5, m9 + paddw m4, m5 + pmaddwd m4, m13 + paddd m0, m2 + paddd m1, m2 + paddd m0, m3 + paddd m1, m4 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + psraw m0, 1 + mova [t1+wq], m0 + add wq, 16 + jl .h_loop + movif32 wq, wm + ret +ALIGN function_align +.hv: + add lpfq, strideq + movif64 wq, r4 + movif32 t0m, t0 + movif32 t1m, t1 + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movq m3, [leftq] + movhps m3, [lpfq+wq] + add leftq, 8 + jmp .hv_main +.hv_extend_left: + mova m3, [lpfq+wq] + pshufb m3, wiener_lshuf7_mem + jmp .hv_main +.hv_bottom: + movif64 wq, r4 + movif32 t0m, t0 + movif32 t1m, t1 + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m3, [lpfq+wq-8] +.hv_main: + mova m4, [lpfq+wq+0] + movu m5, [lpfq+wq+8] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp wd, -20 + jl .hv_have_right + call .extend_right +.hv_have_right: + movif32 t1, t4m + movif32 t0, t2m + pshufb m0, m3, m6 + pshufb m1, m4, m7 + paddw m0, m1 + pshufb m3, m8 + pmaddwd m0, m12 + pshufb m1, m4, m9 + paddw m3, m1 + pshufb m1, m4, m6 + pmaddwd m3, m13 + pshufb m2, m5, m7 + paddw m1, m2 + mova m2, pd_m262128_mem + pshufb m4, m8 + pmaddwd m1, m12 + pshufb m5, m9 + paddw m4, m5 + pmaddwd m4, m13 + paddd m0, m2 + paddd m1, m2 +%if ARCH_X86_64 + mova m2, [t4+wq] + paddw m2, [t2+wq] + mova m5, [t3+wq] +%else + mova m2, [t1+wq] + paddw m2, [t0+wq] + mov t1, t3m + mov t0, t5m + mova m5, [t1+wq] + mov t1, t1m +%endif + paddd m0, m3 + paddd m1, m4 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 +%if ARCH_X86_64 + mova m4, [t5+wq] + paddw m4, [t1+wq] + psraw m0, 1 + paddw m3, m0, [t6+wq] +%else + mova m4, [t0+wq] + paddw m4, [t1+wq] + mov t0, t0m + mov t1, t6m + psraw m0, 1 + paddw m3, m0, [t1+wq] +%endif + mova [t0+wq], m0 + punpcklwd m0, m2, m5 + pmaddwd m0, m15 + punpckhwd m2, m5 + pmaddwd m2, m15 + punpcklwd m1, m3, m4 + pmaddwd m1, m14 + punpckhwd m3, m4 + pmaddwd m3, m14 + paddd m0, m10 + paddd m2, m10 + paddd m0, m1 + paddd m2, m3 + psrad m0, 6 + psrad m2, 6 + packssdw m0, m2 + pmulhw m0, m11 + pxor m1, m1 + pmaxsw m0, m1 + mova [dstq+wq], m0 + add wq, 16 + jl .hv_loop +%if ARCH_X86_64 + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t6 +%else + mov r4, 
t5m + mov t1, t4m + mov t6m, r4 + mov t5m, t1 + mov r4, t3m + mov t1, t2m + mov t4m, r4 + mov t3m, t1 + mov r4, t1m + mov t1, t0 + mov t2m, r4 + mov t0, t6m + mov wq, wm +%endif + add dstq, strideq + ret +.v: + movif64 wq, r4 + movif32 t0m, t0 + movif32 t1m, t1 +.v_loop: +%if ARCH_X86_64 + mova m1, [t4+wq] + paddw m1, [t2+wq] + mova m2, [t3+wq] + mova m4, [t1+wq] + paddw m3, m4, [t6+wq] + paddw m4, [t5+wq] +%else + mov t0, t4m + mov t1, t2m + mova m1, [t0+wq] + paddw m1, [t1+wq] + mov t0, t3m + mov t1, t1m + mova m2, [t0+wq] + mova m4, [t1+wq] + mov t0, t6m + mov t1, t5m + paddw m3, m4, [t0+wq] + paddw m4, [t1+wq] +%endif + punpcklwd m0, m1, m2 + pmaddwd m0, m15 + punpckhwd m1, m2 + pmaddwd m1, m15 + punpcklwd m2, m3, m4 + pmaddwd m2, m14 + punpckhwd m3, m4 + pmaddwd m3, m14 + paddd m0, m10 + paddd m1, m10 + paddd m0, m2 + paddd m1, m3 + psrad m0, 6 + psrad m1, 6 + packssdw m0, m1 + pmulhw m0, m11 + pxor m1, m1 + pmaxsw m0, m1 + mova [dstq+wq], m0 + add wq, 16 + jl .v_loop +%if ARCH_X86_64 + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 +%else + mov t0, t5m + mov t1, t4m + mov r4, t3m + mov t6m, t0 + mov t5m, t1 + mov t4m, r4 + mov r4, t2m + mov t1, t1m + mov t0, t0m + mov t3m, r4 + mov t2m, t1 +%endif + add dstq, strideq + ret + +%if ARCH_X86_32 + %if STACK_ALIGNMENT < 16 + %assign stack_size 12*16+384*8 + %else + %assign stack_size 11*16+384*8 + %endif +cglobal wiener_filter5_16bpc, 4, 7, 8, -stack_size, dst, stride, left, \ + lpf, w, flt + %if STACK_ALIGNMENT < 16 + %define lpfm dword [esp+calloff+4*6] + %define wm dword [esp+calloff+4*7] + %define hd dword [esp+calloff+16*10+0] + %define edgeb byte [esp+calloff+16*10+4] + %define edged dword [esp+calloff+16*10+4] + %else + %define hd dword r5m + %define edgeb byte r7m + %endif + %define PICmem dword [esp+calloff+4*0] + %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers + %define t1m dword [esp+calloff+4*2] + %define t2m dword [esp+calloff+4*3] + %define t3m dword [esp+calloff+4*4] + %define t4m dword [esp+calloff+4*5] + %define t2 t2m + %define t3 t3m + %define t4 t4m + %define m8 [esp+calloff+16*2] + %define m9 [esp+calloff+16*3] + %define m10 [esp+calloff+16*4] + %define m11 [esp+calloff+16*5] + %define m12 [esp+calloff+16*6] + %define m13 [esp+calloff+16*7] + %define m14 [esp+calloff+16*8] + %define m15 [esp+calloff+16*9] + %define base t0-wiener_shifts + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov wd, [rstk+stack_offset+20] + mov wm, wd + mov r5, [rstk+stack_offset+24] + mov hd, r5 + mov r5, [rstk+stack_offset+32] + mov edged, r5 ; edge + %endif +%else +cglobal wiener_filter5_16bpc, 4, 14, 16, 384*8+16, dst, stride, left, lpf, \ + w, h, edge, flt + %define base +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + movifnidn wd, wm +%endif +%if ARCH_X86_64 + mov fltq, r6mp + movifnidn hd, hm + mov edged, r7m + mov t3d, r8m ; pixel_max + movq m12, [fltq] + movq m14, [fltq+16] +%else + %if STACK_ALIGNMENT < 16 + mov t0, [rstk+stack_offset+28] + mov t1, [rstk+stack_offset+36] ; pixel_max + movq m1, [t0] ; fx + movq m3, [t0+16] ; fy + LEA t0, wiener_shifts + %else + mov fltq, r6m + movq m1, [fltq] + movq m3, [fltq+16] + LEA t0, wiener_shifts + mov t1, r8m ; pixel_max + %endif + mov PICmem, t0 +%endif + mova m5, [base+wiener_shufE] + mova m6, [base+wiener_shufB] + mova m7, [base+wiener_shufD] +%if ARCH_X86_64 + lea t4, [wiener_shifts] + add wd, wd + punpcklwd m11, m12, m12 + pshufd m11, m11, q1111 ; x1 + pshufd m12, m12, q1111 ; x2 x3 + punpcklwd m13, m14, m14 + pshufd m13, m13, q1111 ; y1 + pshufd 
m14, m14, q1111 ; y2 y3 + shr t3d, 11 + mova m8, [pd_m262128] ; (1 << 4) - (1 << 18) + add lpfq, wq + lea t1, [rsp+wq+16] + add dstq, wq + neg wq + %define base t4-wiener_shifts + movd m9, [base+wiener_round+t3*4] + movq m10, [base+wiener_shifts+t3*8] + pshufd m9, m9, q0000 + pshufd m0, m10, q0000 + pshufd m10, m10, q1111 + mova m15, [wiener_lshuf5] + pmullw m11, m0 + pmullw m12, m0 + DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w + %define lpfm [rsp] + %define base +%else + add wd, wd + punpcklwd m0, m1, m1 + pshufd m0, m0, q1111 ; x1 + pshufd m1, m1, q1111 ; x2 x3 + punpcklwd m2, m3, m3 + pshufd m2, m2, q1111 ; y1 + pshufd m3, m3, q1111 ; y2 y3 + mova m4, [base+pd_m262128] ; (1 << 4) - (1 << 18) + mova m13, m2 + mova m14, m3 + mova m8, m4 + shr t1, 11 + add lpfq, wq + movd m2, [base+wiener_round+t1*4] + movq m3, [base+wiener_shifts+t1*8] + %if STACK_ALIGNMENT < 16 + lea t1, [esp+16*11+wq+16] + %else + lea t1, [esp+16*10+wq+16] + %endif + add dstq, wq + neg wq + pshufd m2, m2, q0000 + pshufd m4, m3, q0000 + pshufd m3, m3, q1111 + mov wm, wq + pmullw m0, m4 + pmullw m1, m4 + mova m4, [base+wiener_lshuf5] + mova m9, m2 + mova m10, m3 + mova m11, m0 + mova m12, m1 + mova m15, m4 +%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t4, t1 + add t1, 384*2 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + mov t3, t1 + add t1, 384*2 + add r10, strideq + mov lpfm, r10 ; below + call .h + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v2 +.main: + mov t0, t4 +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v2 + mov lpfq, lpfm + call .hv_bottom + add lpfq, strideq + call .hv_bottom +.end: + RET +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov lpfm, r10 + call .h + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v2 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v2 + add t0, 384*6 + call .hv + dec hd + jnz .main +.v2: + call .v +%if ARCH_X86_64 + mov t4, t3 + mov t3, t2 + mov t2, t1 +%else + mov t0, t3m + mov r4, t2m + mov t1, t1m + mov t4m, t0 + mov t3m, r4 + mov t2m, t1 + mov wq, wm +%endif + add dstq, strideq +.v1: + call .v + jmp .end +.extend_right: +%assign stack_offset stack_offset+8 +%assign calloff 8 + movif32 t0, PICmem + pxor m1, m1 + movd m2, wd + mova m0, [base+pb_2_3] + pshufb m2, m1 + mova m1, [base+pb_m6_m5] + psubb m0, m2 + psubb m1, m2 + mova m2, [base+pb_0to15] + pminub m0, m2 + pminub m1, m2 + pshufb m3, m0 + pshufb m4, m1 + ret +%assign stack_offset stack_offset-4 +%assign calloff 4 +.h: + movif64 wq, r4 + movif32 wq, wm + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + mova m4, [lpfq+wq] + movd m3, [leftq+4] + pslldq m4, 4 + por m3, m4 + add leftq, 8 + jmp .h_main +.h_extend_left: + mova m3, [lpfq+wq] ; avoid accessing memory located + pshufb m3, m15 ; before the start of the buffer + jmp .h_main +.h_top: + movif64 wq, r4 + movif32 wq, wm + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m3, [lpfq+wq-4] +.h_main: + movu m4, [lpfq+wq+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp wd, -18 + jl .h_have_right + call .extend_right +.h_have_right: + pshufb m0, m3, m5 + pmaddwd m0, m11 + pshufb m1, m4, m5 + pmaddwd m1, m11 + pshufb m2, m3, m6 + pshufb m3, m7 + paddw m2, m3 + pshufb m3, m4, m6 + pmaddwd m2, m12 + pshufb m4, m7 + paddw m3, m4 + pmaddwd m3, m12 + paddd m0, m8 + paddd m1, m8 + paddd m0, m2 + 
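+; the m8 bias folds the rounding term and a signedness offset into one
+; add; a worked form of the two shifts that follow:
+;   ((sum + (1 << 4) - (1 << 18)) >> 4) >> 1 == ((sum + 16) >> 5) - (1 << 13)
+; i.e. the horizontal output is rounded to 1/32 of the filtered sum and
+; biased down by 8192 so it fits signed 16-bit storage for the vertical
+; pmaddwd pass; the wiener_round constant later absorbs the bias.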
paddd m1, m3 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + psraw m0, 1 + mova [t1+wq], m0 + add wq, 16 + jl .h_loop + movif32 wq, wm + ret +ALIGN function_align +.hv: + add lpfq, strideq + movif64 wq, r4 + movif32 t0m, t0 + movif32 t1m, t1 + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + mova m4, [lpfq+wq] + movd m3, [leftq+4] + pslldq m4, 4 + por m3, m4 + add leftq, 8 + jmp .hv_main +.hv_extend_left: + mova m3, [lpfq+wq] + pshufb m3, m15 + jmp .hv_main +.hv_bottom: + movif64 wq, r4 + movif32 t0m, t0 + movif32 t1m, t1 + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m3, [lpfq+wq-4] +.hv_main: + movu m4, [lpfq+wq+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp wd, -18 + jl .hv_have_right + call .extend_right +.hv_have_right: + movif32 t1, t1m + movif32 t0, t3m + pshufb m0, m3, m5 + pmaddwd m0, m11 + pshufb m1, m4, m5 + pmaddwd m1, m11 + pshufb m2, m3, m6 + pshufb m3, m7 + paddw m2, m3 + pshufb m3, m4, m6 + pmaddwd m2, m12 + pshufb m4, m7 + paddw m3, m4 + pmaddwd m3, m12 + paddd m0, m8 + paddd m1, m8 + paddd m0, m2 +%if ARCH_X86_64 + mova m2, [t3+wq] + paddw m2, [t1+wq] + paddd m1, m3 + mova m4, [t2+wq] +%else + mova m2, [t0+wq] + mov t0, t2m + paddw m2, [t1+wq] + mov t1, t4m + paddd m1, m3 + mova m4, [t0+wq] + mov t0, t0m +%endif + punpckhwd m3, m2, m4 + pmaddwd m3, m14 + punpcklwd m2, m4 +%if ARCH_X86_64 + mova m4, [t4+wq] +%else + mova m4, [t1+wq] +%endif + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + pmaddwd m2, m14 + psraw m0, 1 + mova [t0+wq], m0 + punpckhwd m1, m0, m4 + pmaddwd m1, m13 + punpcklwd m0, m4 + pmaddwd m0, m13 + paddd m3, m9 + paddd m2, m9 + paddd m1, m3 + paddd m0, m2 + psrad m1, 6 + psrad m0, 6 + packssdw m0, m1 + pmulhw m0, m10 + pxor m1, m1 + pmaxsw m0, m1 + mova [dstq+wq], m0 + add wq, 16 + jl .hv_loop +%if ARCH_X86_64 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t4 +%else + mov r4, t3m + mov t1, t2m + mov t4m, r4 + mov t3m, t1 + mov r4, t1m + mov t1, t0 + mov t2m, r4 + mov t0, t4m + mov wq, wm +%endif + add dstq, strideq + ret +.v: + movif64 wq, r4 + movif32 t1m, t1 +.v_loop: +%if ARCH_X86_64 + mova m0, [t1+wq] + paddw m2, m0, [t3+wq] + mova m1, [t2+wq] + mova m4, [t4+wq] +%else + mov t0, t3m + mova m0, [t1+wq] + mov t1, t2m + paddw m2, m0, [t0+wq] + mov t0, t4m + mova m1, [t1+wq] + mova m4, [t0+wq] +%endif + punpckhwd m3, m2, m1 + pmaddwd m3, m14 + punpcklwd m2, m1 + pmaddwd m2, m14 + punpckhwd m1, m0, m4 + pmaddwd m1, m13 + punpcklwd m0, m4 + pmaddwd m0, m13 + paddd m3, m9 + paddd m2, m9 + paddd m1, m3 + paddd m0, m2 + psrad m1, 6 + psrad m0, 6 + packssdw m0, m1 + pmulhw m0, m10 + pxor m1, m1 + pmaxsw m0, m1 + mova [dstq+wq], m0 + add wq, 16 +%if ARCH_X86_64 + jl .v_loop +%else + jge .v_end + mov t1, t1m + jmp .v_loop +.v_end: +%endif + ret + +%macro GATHERDD 3 ; dst, src, tmp + movd %3d, %2 + %if ARCH_X86_64 + movd %1, [r13+%3] + pextrw %3d, %2, 2 + pinsrw %1, [r13+%3+2], 3 + pextrw %3d, %2, 4 + pinsrw %1, [r13+%3+2], 5 + pextrw %3d, %2, 6 + pinsrw %1, [r13+%3+2], 7 + %else + movd %1, [base+sgr_x_by_x-0xf03+%3] + pextrw %3, %2, 2 + pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 3 + pextrw %3, %2, 4 + pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 5 + pextrw %3, %2, 6 + pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 7 + %endif +%endmacro + +%macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore + %if ARCH_X86_64 + %define tmp r14 + %else + %define tmp %4 + %endif + GATHERDD %1, %2, tmp + GATHERDD %2, %3, tmp + movif32 %4, %5 + psrld %1, 24 + psrld %2, 24 + packssdw %1, %2 +%endmacro + +%macro MAXSD 3-4 0 ; dst, src, 
restore_tmp + pcmpgtd %3, %1, %2 + pand %1, %3 + pandn %3, %2 + por %1, %3 + %if %4 == 1 + pxor %3, %3 + %endif +%endmacro + +%macro MULLD 3 ; dst, src, tmp + pmulhuw %3, %1, %2 + pmullw %1, %2 + pslld %3, 16 + paddd %1, %3 +%endmacro + +%if ARCH_X86_32 +DECLARE_REG_TMP 0, 1, 2, 3, 5 + %if STACK_ALIGNMENT < 16 + %assign extra_stack 5*16 + %else + %assign extra_stack 3*16 + %endif +cglobal sgr_filter_5x5_16bpc, 1, 7, 8, -400*24-16-extra_stack, \ + dst, stride, left, lpf, w + %if STACK_ALIGNMENT < 16 + %define dstm dword [esp+calloff+16*0+4*6] + %define stridemp dword [esp+calloff+16*0+4*7] + %define leftm dword [esp+calloff+16*3+4*0] + %define lpfm dword [esp+calloff+16*3+4*1] + %define w0m dword [esp+calloff+16*3+4*2] + %define hd dword [esp+calloff+16*3+4*3] + %define edgeb byte [esp+calloff+16*3+4*4] + %define edged dword [esp+calloff+16*3+4*4] + %define leftmp leftm + %else + %define w0m wm + %define hd dword r5m + %define edgeb byte r7m + %define edged dword r7m + %endif + %define hvsrcm dword [esp+calloff+4*0] + %define w1m dword [esp+calloff+4*1] + %define t0m dword [esp+calloff+4*2] + %define t2m dword [esp+calloff+4*3] + %define t3m dword [esp+calloff+4*4] + %define t4m dword [esp+calloff+4*5] + %define m8 [base+pd_8] + %define m9 [base+pd_0xfffffff0] + %define m10 [esp+calloff+16*2] + %define m11 [base+pd_0xf00800a4] + %define m12 [base+sgr_lshuf5] + %define m13 [base+pd_34816] + %define m14 [base+pw_1023] + %define r10 r4 + %define base r6-$$ + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov strideq, [rstk+stack_offset+ 8] + mov leftq, [rstk+stack_offset+12] + mov lpfq, [rstk+stack_offset+16] + mov wd, [rstk+stack_offset+20] + mov dstm, dstq + mov stridemp, strideq + mov leftm, leftq + mov r1, [rstk+stack_offset+24] + mov r2, [rstk+stack_offset+32] + mov lpfm, lpfq + mov hd, r1 + mov edged, r2 + %endif +%else +cglobal sgr_filter_5x5_16bpc, 4, 15, 15, -400*24-16, dst, stride, left, lpf, \ + w, h, edge, params +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + movifnidn wd, wm +%endif +%if ARCH_X86_64 + mov paramsq, r6mp + lea r13, [sgr_x_by_x-0xf03] + movifnidn hd, hm + add wd, wd + mov edged, r7m + movu m10, [paramsq] + mova m12, [sgr_lshuf5] + add lpfq, wq + mova m8, [pd_8] + lea t1, [rsp+wq+20] + mova m9, [pd_0xfffffff0] + add dstq, wq + lea t3, [rsp+wq*2+400*12+16] + mova m11, [pd_0xf00800a4] + lea t4, [rsp+wq+400*20+16] + pshufhw m7, m10, q0000 + pshufb m10, [pw_256] ; s0 + punpckhqdq m7, m7 ; w0 + neg wq + mova m13, [pd_34816] ; (1 << 11) + (1 << 15) + pxor m6, m6 + mova m14, [pw_1023] + psllw m7, 4 + DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w + %define lpfm [rsp] +%else + mov r1, [rstk+stack_offset+28] ; params + LEA r6, $$ + add wd, wd + movu m1, [r1] + add lpfm, wq + lea t1, [rsp+extra_stack+wq+20] + add dstq, wq + lea t3, [rsp+extra_stack+wq*2+400*12+16] + mov dstm, dstq + lea t4, [rsp+extra_stack+wq+400*20+16] + mov t3m, t3 + pshufhw m7, m1, q0000 + mov t4m, t4 + pshufb m1, [base+pw_256] ; s0 + punpckhqdq m7, m7 ; w0 + psllw m7, 4 + neg wq + mova m10, m1 + pxor m6, m6 + mov w1m, wd + sub wd, 4 + mov lpfq, lpfm + mov w0m, wd + %define strideq r5 +%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, stridemp + movif32 t2m, t1 + mov t2, t1 + call .top_fixup + add t1, 400*6 + call .h_top + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, strideq + mov lpfm, r10 ; below + movif32 t0m, t2 + mov t0, t2 + dec hd + jz .height1 + or edged, 16 + call .h +.main: + add lpfq, stridemp + movif32 t4, t4m + call 
.hv + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + movif32 lpfq, hvsrcm + add lpfq, stridemp +%if ARCH_X86_64 + test hb, hb +%else + mov r4, hd + test r4, r4 +%endif + jz .odd_height + call .h + add lpfq, stridemp + call .hv + movif32 dstq, dstm + call .n0 + call .n1 + sub hd, 2 + movif32 t0, t0m + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, lpfm + call .h_top + add lpfq, stridemp + call .hv_bottom +.end: + movif32 dstq, dstm + call .n0 + call .n1 +.end2: + RET +.height1: + movif32 t4, t4m + call .hv + call .prep_n + jmp .odd_height_end +.odd_height: + call .hv + movif32 dstq, dstm + call .n0 + call .n1 +.odd_height_end: + call .v + movif32 dstq, dstm + call .n0 + jmp .end2 +.extend_bottom: + call .v + jmp .end +.no_top: + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov lpfm, r10 + call .h + lea t2, [t1+400*6] + movif32 t2m, t2 + call .top_fixup + dec hd + jz .no_top_height1 + or edged, 16 + mov t0, t1 + mov t1, t2 + movif32 t0m, t0 + jmp .main +.no_top_height1: + movif32 t3, t3m + movif32 t4, t4m + call .v + call .prep_n + jmp .odd_height_end +.extend_right: + movd m0, wd + movd m1, [lpfq-2] + mova m2, [base+pw_256] + mova m3, [base+pb_m14_m13] + pshufb m0, m6 + pshufb m1, m2 + psubb m2, m0 + psubb m3, m0 + mova m0, [base+pb_0to15] + pcmpgtb m2, m0 + pcmpgtb m3, m0 + pand m4, m2 + pand m5, m3 + pandn m2, m1 + pandn m3, m1 + por m4, m2 + por m5, m3 + ret +%assign stack_offset stack_offset+4 +%assign calloff 4 +.h: ; horizontal boxsum +%if ARCH_X86_64 + lea wq, [r4-4] +%else + %define leftq r4 +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 10 + jmp .h_main +.h_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, m12 + jmp .h_main +.h_top: +%if ARCH_X86_64 + lea wq, [r4-4] +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 wq, w0m +.h_loop: + movu m4, [lpfq+wq- 2] +.h_main: + movu m5, [lpfq+wq+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp wd, -20 + jl .h_have_right + call .extend_right +.h_have_right: + palignr m2, m5, m4, 2 + paddw m0, m4, m2 + palignr m3, m5, m4, 6 + paddw m0, m3 + punpcklwd m1, m2, m3 + pmaddwd m1, m1 + punpckhwd m2, m3 + pmaddwd m2, m2 + palignr m5, m4, 8 + paddw m0, m5 + punpcklwd m3, m4, m5 + pmaddwd m3, m3 + paddd m1, m3 + punpckhwd m3, m4, m5 + pmaddwd m3, m3 + shufps m4, m5, q2121 + paddw m0, m4 ; sum + punpcklwd m5, m4, m6 + pmaddwd m5, m5 + punpckhwd m4, m6 + pmaddwd m4, m4 + paddd m2, m3 + test edgeb, 16 ; y > 0 + jz .h_loop_end + paddw m0, [t1+wq+400*0] + paddd m1, [t1+wq+400*2] + paddd m2, [t1+wq+400*4] +.h_loop_end: + paddd m1, m5 ; sumsq + paddd m2, m4 + mova [t1+wq+400*0], m0 + mova [t1+wq+400*2], m1 + mova [t1+wq+400*4], m2 + add wq, 16 + jl .h_loop + ret +.top_fixup: +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov wd, w0m +%endif +.top_fixup_loop: ; the sums of the first row needs to be doubled + mova m0, [t1+wq+400*0] + mova m1, [t1+wq+400*2] + mova m2, [t1+wq+400*4] + paddw m0, m0 + paddd m1, m1 + paddd m2, m2 + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m1 + mova [t2+wq+400*4], m2 + add wq, 16 + jl .top_fixup_loop + ret +ALIGN function_align +.hv: ; horizontal boxsum + vertical boxsum + ab +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + 
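+; LR_HAVE_LEFT: left[] holds pixels saved from the adjacent restoration
+; unit, and the movddup/palignr pair below splices its tail in front of
+; the row so the 5-wide box window never reads out of bounds; without
+; the flag, sgr_lshuf5 replicates the leftmost pixel instead.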
mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 10 + jmp .hv_main +.hv_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, m12 + jmp .hv_main +.hv_bottom: +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv_loop_start +%endif +.hv_loop: + movif32 lpfq, hvsrcm +.hv_loop_start: + movu m4, [lpfq+wq- 2] +.hv_main: + movu m5, [lpfq+wq+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp wd, -20 + jl .hv_have_right + call .extend_right +.hv_have_right: + movif32 t3, hd + palignr m3, m5, m4, 2 + paddw m0, m4, m3 + palignr m1, m5, m4, 6 + paddw m0, m1 + punpcklwd m2, m3, m1 + pmaddwd m2, m2 + punpckhwd m3, m1 + pmaddwd m3, m3 + palignr m5, m4, 8 + paddw m0, m5 + punpcklwd m1, m4, m5 + pmaddwd m1, m1 + paddd m2, m1 + punpckhwd m1, m4, m5 + pmaddwd m1, m1 + shufps m4, m5, q2121 + paddw m0, m4 ; h sum + punpcklwd m5, m4, m6 + pmaddwd m5, m5 + punpckhwd m4, m6 + pmaddwd m4, m4 + paddd m3, m1 + paddd m2, m5 ; h sumsq + paddd m3, m4 + paddw m1, m0, [t1+wq+400*0] + paddd m4, m2, [t1+wq+400*2] + paddd m5, m3, [t1+wq+400*4] +%if ARCH_X86_64 + test hd, hd +%else + test t3, t3 +%endif + jz .hv_last_row +.hv_main2: + paddw m1, [t2+wq+400*0] ; hv sum + paddd m4, [t2+wq+400*2] ; hv sumsq + paddd m5, [t2+wq+400*4] + mova [t0+wq+400*0], m0 + mova [t0+wq+400*2], m2 + mova [t0+wq+400*4], m3 + psrlw m3, m1, 1 + paddd m4, m8 + pavgw m3, m6 ; (b + 2) >> 2 + paddd m5, m8 + pand m4, m9 ; ((a + 8) >> 4) << 4 + pand m5, m9 + psrld m2, m4, 4 + psrld m0, m5, 4 + paddd m2, m4 + psrld m4, 1 + paddd m0, m5 + psrld m5, 1 + paddd m4, m2 ; a * 25 + paddd m5, m0 + punpcklwd m2, m3, m6 + punpckhwd m3, m6 + pmaddwd m2, m2 ; b * b + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m6 + MAXSD m5, m3, m6, 1 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m10, m2 ; p * s + MULLD m5, m10, m2 + pmaddwd m0, m11 ; b * 164 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m4, 20 ; min(z, 255) + movif32 t3, t3m + psrld m5, 20 + GATHER_X_BY_X m3, m4, m5, t2, t2m + punpcklwd m4, m3, m3 + punpckhwd m5, m3, m3 + MULLD m0, m4, m2 + MULLD m1, m5, m2 + paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m13 + mova [t4+wq+4], m3 + psrld m0, 12 ; b + psrld m1, 12 + mova [t3+wq*2+ 8], m0 + mova [t3+wq*2+24], m1 + add wq, 16 + jl .hv_loop + mov t2, t1 + mov t1, t0 + mov t0, t2 + movif32 t2m, t2 + movif32 t0m, t0 + ret +.hv_last_row: ; esoteric edge case for odd heights + mova [t1+wq+400*0], m1 + paddw m1, m0 + mova [t1+wq+400*2], m4 + paddd m4, m2 + mova [t1+wq+400*4], m5 + paddd m5, m3 + jmp .hv_main2 +.v: ; vertical boxsum + ab +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov wd, w0m +%endif +.v_loop: + mova m0, [t1+wq+400*0] + mova m2, [t1+wq+400*2] + mova m3, [t1+wq+400*4] + paddw m1, m0, [t2+wq+400*0] + paddd m4, m2, [t2+wq+400*2] + paddd m5, m3, [t2+wq+400*4] + paddw m0, m0 + paddd m2, m2 + paddd m3, m3 + paddw m1, m0 ; hv sum + paddd m4, m2 ; hv sumsq + paddd m5, m3 + psrlw m3, m1, 1 + paddd m4, m8 + pavgw m3, m6 ; (b + 2) >> 2 + paddd m5, m8 + pand m4, m9 ; ((a + 8) >> 4) << 4 + pand m5, m9 + psrld m2, m4, 4 + psrld m0, m5, 4 + paddd m2, m4 + psrld m4, 1 + paddd m0, m5 + psrld m5, 1 + paddd m4, m2 ; a * 25 + paddd m5, m0 + punpcklwd m2, m3, m6 + punpckhwd m3, m6 + pmaddwd m2, m2 ; b * b + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m6 + MAXSD m5, m3, m6, 1 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m10, m2 ; p * s + 
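+; this block derives the guided-filter coefficients from the 5x5 box
+; sums (a = sumsq, b = sum, as the comments above use them); roughly:
+;   p  = max(((a + 8) >> 4) * 25 - bb*bb, 0)   ; bb = (b + 2) >> 2
+;   z  = min((p * s + (1 << 19)) >> 20, 255)
+;   x  = sgr_x_by_x[z]
+;   b' = (x * b * 164 + (1 << 11) + (1 << 15)) >> 12
+; MULLD synthesizes a 32x16-bit multiply from pmullw/pmulhuw since
+; pmulld needs SSE4.1; the saturating paddusw of 0xf008 into each high
+; word both clamps z at 255 and pre-biases the index to match the
+; sgr_x_by_x-0xf03 table base used by GATHER_X_BY_X.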
MULLD m5, m10, m2 + pmaddwd m0, m11 ; b * 164 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m4, 20 ; min(z, 255) + psrld m5, 20 + GATHER_X_BY_X m3, m4, m5, t2, t2m + punpcklwd m4, m3, m3 + punpckhwd m5, m3, m3 + MULLD m0, m4, m2 + MULLD m1, m5, m2 + paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m13 + mova [t4+wq+4], m3 + psrld m0, 12 ; b + psrld m1, 12 + mova [t3+wq*2+ 8], m0 + mova [t3+wq*2+24], m1 + add wq, 16 + jl .v_loop + ret +.prep_n: ; initial neighbor setup + movif64 wq, r4 + movif32 wd, w1m +.prep_n_loop: + movu m0, [t4+wq*1+ 2] + movu m3, [t4+wq*1+ 4] + movu m1, [t3+wq*2+ 4] + movu m4, [t3+wq*2+ 8] + movu m2, [t3+wq*2+20] + movu m5, [t3+wq*2+24] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + paddw m3, [t4+wq*1+ 0] + paddd m4, [t3+wq*2+ 0] + paddd m5, [t3+wq*2+16] + paddw m0, m3 + psllw m3, 2 + paddd m1, m4 + pslld m4, 2 + paddd m2, m5 + pslld m5, 2 + paddw m0, m3 ; a 565 + paddd m1, m4 ; b 565 + paddd m2, m5 + mova [t4+wq*1+400*2+ 0], m0 + mova [t3+wq*2+400*4+ 0], m1 + mova [t3+wq*2+400*4+16], m2 + add wq, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + movif64 wq, r4 + movif32 wd, w1m +.n0_loop: + movu m0, [t4+wq*1+ 2] + movu m3, [t4+wq*1+ 4] + movu m1, [t3+wq*2+ 4] + movu m4, [t3+wq*2+ 8] + movu m2, [t3+wq*2+20] + movu m5, [t3+wq*2+24] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + paddw m3, [t4+wq*1+ 0] + paddd m4, [t3+wq*2+ 0] + paddd m5, [t3+wq*2+16] + paddw m0, m3 + psllw m3, 2 + paddd m1, m4 + pslld m4, 2 + paddd m2, m5 + pslld m5, 2 + paddw m0, m3 ; a 565 + paddd m1, m4 ; b 565 + paddd m2, m5 + paddw m3, m0, [t4+wq*1+400*2+ 0] + paddd m4, m1, [t3+wq*2+400*4+ 0] + paddd m5, m2, [t3+wq*2+400*4+16] + mova [t4+wq*1+400*2+ 0], m0 + mova [t3+wq*2+400*4+ 0], m1 + mova [t3+wq*2+400*4+16], m2 + mova m0, [dstq+wq] + punpcklwd m1, m0, m6 ; src + punpcklwd m2, m3, m6 ; a + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + psubd m4, m2 ; b - a * src + (1 << 8) + psubd m5, m3 + psrad m4, 9 + psrad m5, 9 + packssdw m4, m5 + pmulhrsw m4, m7 + paddw m0, m4 + pmaxsw m0, m6 + pminsw m0, m14 + mova [dstq+wq], m0 + add wq, 16 + jl .n0_loop + add dstq, stridemp + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + movif64 wq, r4 + movif32 wd, w1m +.n1_loop: + mova m0, [dstq+wq] + mova m3, [t4+wq*1+400*2+ 0] + mova m4, [t3+wq*2+400*4+ 0] + mova m5, [t3+wq*2+400*4+16] + punpcklwd m1, m0, m6 ; src + punpcklwd m2, m3, m6 ; a + pmaddwd m2, m1 + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + psubd m4, m2 ; b - a * src + (1 << 7) + psubd m5, m3 + psrad m4, 8 + psrad m5, 8 + packssdw m4, m5 + pmulhrsw m4, m7 + paddw m0, m4 + pmaxsw m0, m6 + pminsw m0, m14 + mova [dstq+wq], m0 + add wq, 16 + jl .n1_loop + add dstq, stridemp + movif32 dstm, dstq + ret + +%if ARCH_X86_32 + %if STACK_ALIGNMENT < 16 + %assign extra_stack 4*16 + %else + %assign extra_stack 2*16 + %endif +cglobal sgr_filter_3x3_16bpc, 1, 7, 8, -400*42-16-extra_stack, \ + dst, stride, left, lpf, w + %if STACK_ALIGNMENT < 16 + %define dstm dword [esp+calloff+16*2+4*0] + %define stridemp dword [esp+calloff+16*2+4*1] + %define leftm dword [esp+calloff+16*2+4*2] + %define lpfm dword [esp+calloff+16*2+4*3] + %define w0m dword [esp+calloff+16*2+4*4] + %define hd dword [esp+calloff+16*2+4*5] + %define edgeb byte [esp+calloff+16*2+4*6] + %define edged dword [esp+calloff+16*2+4*6] + %define leftmp leftm + %else + %define w0m wm + %define hd dword r5m + %define edgeb byte r7m + %define edged dword r7m + %endif + 
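+; sgr_filter_3x3 is the radius-1 half of SGR: 3-wide box sums feeding
+; the same p/z/x_by_x pipeline as the 5x5 case, with 455 in place of
+; 164 as the b scale and ((a + 8) >> 4) * 9 in place of * 25. even and
+; odd rows take separate paths (.hv0/.hv1, .v0/.v1) since every row
+; produces an output but the t1/t2 sum rows only swap after odd rows.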
%define hvsrcm dword [esp+calloff+4*0] + %define w1m dword [esp+calloff+4*1] + %define t3m dword [esp+calloff+4*2] + %define t4m dword [esp+calloff+4*3] + %define m8 [base+pd_8] + %define m9 [esp+calloff+16*1] + %define m10 [base+pd_0xf00801c7] + %define m11 [base+pd_34816] + %define m12 [base+sgr_lshuf3] + %define m13 [base+pw_1023] + %define m14 m6 + %define base r6-$$ + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov strideq, [rstk+stack_offset+ 8] + mov leftq, [rstk+stack_offset+12] + mov lpfq, [rstk+stack_offset+16] + mov wd, [rstk+stack_offset+20] + mov dstm, dstq + mov stridemp, strideq + mov leftm, leftq + mov r1, [rstk+stack_offset+24] + mov r2, [rstk+stack_offset+32] + mov lpfm, lpfq + mov hd, r1 + mov edged, r2 + %endif +%else +cglobal sgr_filter_3x3_16bpc, 4, 15, 15, -400*42-8, dst, stride, left, lpf, \ + w, h, edge, params +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + movifnidn wd, wm +%endif +%if ARCH_X86_64 + mov paramsq, r6mp + lea r13, [sgr_x_by_x-0xf03] + movifnidn hd, hm + add wd, wd + mov edged, r7m + movq m9, [paramsq+4] + add lpfq, wq + lea t1, [rsp+wq+12] + mova m8, [pd_8] + add dstq, wq + lea t3, [rsp+wq*2+400*12+8] + mova m10, [pd_0xf00801c7] + lea t4, [rsp+wq+400*32+8] + mova m11, [pd_34816] + pshuflw m7, m9, q3333 + pshufb m9, [pw_256] ; s1 + punpcklqdq m7, m7 ; w1 + neg wq + pxor m6, m6 + mova m13, [pw_1023] + psllw m7, 4 + mova m12, [sgr_lshuf3] + DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w + %define lpfm [rsp] +%else + mov r1, [rstk+stack_offset+28] ; params + LEA r6, $$ + add wd, wd + movq m1, [r1+4] + add lpfm, wq + lea t1, [rsp+extra_stack+wq+20] + add dstq, wq + lea t3, [rsp+extra_stack+wq*2+400*12+16] + mov dstm, dstq + lea t4, [rsp+extra_stack+wq+400*32+16] + mov t3m, t3 + pshuflw m7, m1, q3333 + mov t4m, t4 + pshufb m1, [base+pw_256] ; s1 + punpcklqdq m7, m7 ; w1 + psllw m7, 4 + neg wq + mova m9, m1 + pxor m6, m6 + mov w1m, wd + sub wd, 4 + mov lpfq, lpfm + mov w0m, wd + %define strideq r5 +%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, stridemp + mov t2, t1 + add t1, 400*6 + call .h_top + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, strideq + mov lpfm, r10 ; below + movif32 t4, t4m + call .hv0 +.main: + dec hd + jz .height1 + movif32 lpfq, hvsrcm + add lpfq, stridemp + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + movif32 lpfq, hvsrcm + add lpfq, stridemp + call .hv0 +%if ARCH_X86_64 + test hb, hb +%else + mov r4, hd + test r4, r4 +%endif + jz .odd_height + movif32 lpfq, hvsrcm + add lpfq, stridemp + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, lpfm + call .hv0_bottom + movif32 lpfq, hvsrcm + add lpfq, stridemp + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov lpfm, r10 + call .h +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov wq, w0m + mov hvsrcm, lpfq +%endif + lea t2, [t1+400*6] +.top_fixup_loop: + mova m0, [t1+wq+400*0] + mova m1, [t1+wq+400*2] + mova m2, [t1+wq+400*4] + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m1 + mova [t2+wq+400*4], m2 + add wq, 16 + jl .top_fixup_loop + movif32 t3, t3m + movif32 t4, t4m + 
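+; with no top edge the first row's sums were copied unchanged into t2
+; by the loop above, making row 0 act as its own upper neighbour, so
+; .v0 can form the first even-row ab terms as if a real row existed.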
call .v0 + jmp .main +.extend_right: + movd m1, wd + movd m5, [lpfq-2] + mova m2, [base+pw_256] + mova m3, [base+pb_0to15] + pshufb m1, m6 + pshufb m5, m2 + psubb m2, m1 + pcmpgtb m2, m3 + pand m4, m2 + pandn m2, m5 + por m4, m2 + ret +%assign stack_offset stack_offset+4 +%assign calloff 4 +.h: ; horizontal boxsum +%if ARCH_X86_64 + lea wq, [r4-4] +%else + %define leftq r4 +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 12 + jmp .h_main +.h_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, m12 + jmp .h_main +.h_top: +%if ARCH_X86_64 + lea wq, [r4-4] +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 wq, w0m +.h_loop: + movu m4, [lpfq+wq+ 0] +.h_main: + movu m5, [lpfq+wq+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp wd, -18 + jl .h_have_right + call .extend_right +.h_have_right: + palignr m0, m5, m4, 2 + paddw m1, m4, m0 + punpcklwd m2, m4, m0 + pmaddwd m2, m2 + punpckhwd m3, m4, m0 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m1, m5 ; sum + punpcklwd m4, m5, m6 + pmaddwd m4, m4 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m4 ; sumsq + paddd m3, m5 + mova [t1+wq+400*0], m1 + mova [t1+wq+400*2], m2 + mova [t1+wq+400*4], m3 + add wq, 16 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows) +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 12 + jmp .hv0_main +.hv0_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, m12 + jmp .hv0_main +.hv0_bottom: +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv0_loop_start +%endif +.hv0_loop: + movif32 lpfq, hvsrcm +.hv0_loop_start: + movu m4, [lpfq+wq+ 0] +.hv0_main: + movu m5, [lpfq+wq+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp wd, -18 + jl .hv0_have_right + call .extend_right +.hv0_have_right: + palignr m0, m5, m4, 2 + paddw m1, m4, m0 + punpcklwd m2, m4, m0 + pmaddwd m2, m2 + punpckhwd m3, m4, m0 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m1, m5 ; sum + punpcklwd m4, m5, m6 + pmaddwd m4, m4 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m4 ; sumsq + paddd m3, m5 + paddw m0, m1, [t1+wq+400*0] + paddd m4, m2, [t1+wq+400*2] + paddd m5, m3, [t1+wq+400*4] + mova [t1+wq+400*0], m1 + mova [t1+wq+400*2], m2 + mova [t1+wq+400*4], m3 + paddw m1, m0, [t2+wq+400*0] + paddd m2, m4, [t2+wq+400*2] + paddd m3, m5, [t2+wq+400*4] + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m4 + mova [t2+wq+400*4], m5 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m14 + MAXSD m5, m3, m14 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m9, m14 ; p * s + MULLD m5, m9, m14 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrld m4, 20 ; min(z, 255) + movif32 t3, t3m + psrld m5, 20 + GATHER_X_BY_X m3, m4, m5, r0, dstm + punpcklwd m4, m3, m3 + punpckhwd m5, m3, m3 + MULLD m0, m4, m14 + MULLD m1, 
m5, m14 +%if ARCH_X86_32 + pxor m6, m6 +%endif + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + mova [t4+wq+4], m3 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+ 8], m0 + mova [t3+wq*2+24], m1 + add wq, 16 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 12 + jmp .hv1_main +.hv1_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, m12 + jmp .hv1_main +.hv1_bottom: +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv1_loop_start +%endif +.hv1_loop: + movif32 lpfq, hvsrcm +.hv1_loop_start: + movu m4, [lpfq+wq+ 0] +.hv1_main: + movu m5, [lpfq+wq+16] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp wd, -18 + jl .hv1_have_right + call .extend_right +.hv1_have_right: + palignr m1, m5, m4, 2 + paddw m0, m4, m1 + punpcklwd m2, m4, m1 + pmaddwd m2, m2 + punpckhwd m3, m4, m1 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m0, m5 ; h sum + punpcklwd m1, m5, m6 + pmaddwd m1, m1 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m1 ; h sumsq + paddd m3, m5 + paddw m1, m0, [t2+wq+400*0] + paddd m4, m2, [t2+wq+400*2] + paddd m5, m3, [t2+wq+400*4] + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m2 + mova [t2+wq+400*4], m3 + paddd m4, m8 + paddd m5, m8 + psrld m4, 4 ; (a + 8) >> 4 + psrld m5, 4 + pslld m2, m4, 3 + pslld m3, m5, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m14 + MAXSD m5, m3, m14 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m9, m14 ; p * s + MULLD m5, m9, m14 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrld m4, 20 ; min(z, 255) + movif32 t3, t3m + psrld m5, 20 + GATHER_X_BY_X m3, m4, m5, r0, dstm + punpcklwd m4, m3, m3 + punpckhwd m5, m3, m3 + MULLD m0, m4, m14 + MULLD m1, m5, m14 +%if ARCH_X86_32 + pxor m6, m6 +%endif + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + mova [t4+wq*1+400*2 +4], m3 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+400*4+ 8], m0 + mova [t3+wq*2+400*4+24], m1 + add wq, 16 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab (even rows) +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov wd, w0m +%endif +.v0_loop: + mova m0, [t1+wq+400*0] + mova m4, [t1+wq+400*2] + mova m5, [t1+wq+400*4] + paddw m0, m0 + paddd m4, m4 + paddd m5, m5 + paddw m1, m0, [t2+wq+400*0] + paddd m2, m4, [t2+wq+400*2] + paddd m3, m5, [t2+wq+400*4] + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m4 + mova [t2+wq+400*4], m5 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m14 + MAXSD m5, m3, m14 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m9, m14 ; p * s + MULLD m5, m9, m14 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrld m4, 20 ; 
min(z, 255) + psrld m5, 20 + GATHER_X_BY_X m3, m4, m5, r0, dstm + punpcklwd m4, m3, m3 + punpckhwd m5, m3, m3 + MULLD m0, m4, m14 + MULLD m1, m5, m14 +%if ARCH_X86_32 + pxor m6, m6 +%endif + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + mova [t4+wq*1+400*0+ 4], m3 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+400*0+ 8], m0 + mova [t3+wq*2+400*0+24], m1 + add wq, 16 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov wd, w0m +%endif +.v1_loop: + mova m0, [t1+wq+400*0] + mova m4, [t1+wq+400*2] + mova m5, [t1+wq+400*4] + paddw m1, m0, [t2+wq+400*0] + paddd m2, m4, [t2+wq+400*2] + paddd m3, m5, [t2+wq+400*4] + mova [t2+wq+400*0], m0 + mova [t2+wq+400*2], m4 + mova [t2+wq+400*4], m5 + paddd m2, m8 + paddd m3, m8 + psrld m2, 4 ; (a + 8) >> 4 + psrld m3, 4 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m6 ; (b + 2) >> 2 + punpcklwd m2, m3, m6 + pmaddwd m2, m2 + punpckhwd m3, m6 + pmaddwd m3, m3 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + MAXSD m4, m2, m14 + MAXSD m5, m3, m14 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m9, m14 ; p * s + MULLD m5, m9, m14 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrld m4, 20 ; min(z, 255) + psrld m5, 20 + GATHER_X_BY_X m3, m4, m5, r0, dstm + punpcklwd m4, m3, m3 + punpckhwd m5, m3, m3 + MULLD m0, m4, m14 + MULLD m1, m5, m14 +%if ARCH_X86_32 + pxor m6, m6 +%endif + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + mova [t4+wq*1+400*2+ 4], m3 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+400*4+ 8], m0 + mova [t3+wq*2+400*4+24], m1 + add wq, 16 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + movif64 wq, r4 + movif32 wd, w1m +.prep_n_loop: + movu m0, [t4+wq*1+400*0+ 4] + movu m1, [t3+wq*2+400*0+ 8] + movu m2, [t3+wq*2+400*0+24] + movu m3, [t4+wq*1+400*0+ 2] + movu m4, [t3+wq*2+400*0+ 4] + movu m5, [t3+wq*2+400*0+20] + paddw m0, [t4+wq*1+400*0+ 0] + paddd m1, [t3+wq*2+400*0+ 0] + paddd m2, [t3+wq*2+400*0+16] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + psllw m3, 2 ; a[-1] 444 + pslld m4, 2 ; b[-1] 444 + pslld m5, 2 + psubw m3, m0 ; a[-1] 343 + psubd m4, m1 ; b[-1] 343 + psubd m5, m2 + mova [t4+wq*1+400*4], m3 + mova [t3+wq*2+400*8+ 0], m4 + mova [t3+wq*2+400*8+16], m5 + movu m0, [t4+wq*1+400*2+ 4] + movu m1, [t3+wq*2+400*4+ 8] + movu m2, [t3+wq*2+400*4+24] + movu m3, [t4+wq*1+400*2+ 2] + movu m4, [t3+wq*2+400*4+ 4] + movu m5, [t3+wq*2+400*4+20] + paddw m0, [t4+wq*1+400*2+ 0] + paddd m1, [t3+wq*2+400*4+ 0] + paddd m2, [t3+wq*2+400*4+16] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + psllw m3, 2 ; a[ 0] 444 + pslld m4, 2 ; b[ 0] 444 + pslld m5, 2 + mova [t4+wq*1+400* 6], m3 + mova [t3+wq*2+400*12+ 0], m4 + mova [t3+wq*2+400*12+16], m5 + psubw m3, m0 ; a[ 0] 343 + psubd m4, m1 ; b[ 0] 343 + psubd m5, m2 + mova [t4+wq*1+400* 8], m3 + mova [t3+wq*2+400*16+ 0], m4 + mova [t3+wq*2+400*16+16], m5 + add wq, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + movif64 wq, r4 + movif32 wd, w1m +.n0_loop: + movu m3, [t4+wq*1+400*0+4] + movu m1, [t4+wq*1+400*0+2] + paddw m3, [t4+wq*1+400*0+0] + paddw m1, m3 + psllw m1, 2 ; a[ 1] 444 + psubw m2, m1, m3 ; a[ 1] 343 + paddw m3, m2, [t4+wq*1+400*4] + paddw m3, [t4+wq*1+400*6] + mova [t4+wq*1+400*4], m2 + mova [t4+wq*1+400*6], m1 + movu m4, [t3+wq*2+400*0+8] + movu m1, [t3+wq*2+400*0+4] + paddd m4, [t3+wq*2+400*0+0] + paddd m1, m4 + 
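+; the 444/343 forms prefold the radius-1 neighbour kernel: with S the
+; horizontal 3-tap sum, 444 = 4*S and 343 = 4*S - (left + right)
+; = 3*left + 4*mid + 3*right, so 343(above) + 444(mid) + 343(below)
+; reconstructs the full (3,4,3)/(4,4,4)/(3,4,3) window (weight 32)
+; for one shift and one subtract per row.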
pslld m1, 2 ; b[ 1] 444 + psubd m2, m1, m4 ; b[ 1] 343 + paddd m4, m2, [t3+wq*2+400* 8+ 0] + paddd m4, [t3+wq*2+400*12+ 0] + mova [t3+wq*2+400* 8+ 0], m2 + mova [t3+wq*2+400*12+ 0], m1 + movu m5, [t3+wq*2+400*0+24] + movu m1, [t3+wq*2+400*0+20] + paddd m5, [t3+wq*2+400*0+16] + paddd m1, m5 + pslld m1, 2 + psubd m2, m1, m5 + paddd m5, m2, [t3+wq*2+400* 8+16] + paddd m5, [t3+wq*2+400*12+16] + mova [t3+wq*2+400* 8+16], m2 + mova [t3+wq*2+400*12+16], m1 + mova m0, [dstq+wq] + punpcklwd m1, m0, m6 + punpcklwd m2, m3, m6 + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + psubd m4, m2 ; b - a * src + (1 << 8) + psubd m5, m3 + psrad m4, 9 + psrad m5, 9 + packssdw m4, m5 + pmulhrsw m4, m7 + paddw m0, m4 + pmaxsw m0, m6 + pminsw m0, m13 + mova [dstq+wq], m0 + add wq, 16 + jl .n0_loop + add dstq, stridemp + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + movif64 wq, r4 + movif32 wd, w1m +.n1_loop: + movu m3, [t4+wq*1+400*2+4] + movu m1, [t4+wq*1+400*2+2] + paddw m3, [t4+wq*1+400*2+0] + paddw m1, m3 + psllw m1, 2 ; a[ 1] 444 + psubw m2, m1, m3 ; a[ 1] 343 + paddw m3, m2, [t4+wq*1+400*6] + paddw m3, [t4+wq*1+400*8] + mova [t4+wq*1+400*6], m1 + mova [t4+wq*1+400*8], m2 + movu m4, [t3+wq*2+400*4+8] + movu m1, [t3+wq*2+400*4+4] + paddd m4, [t3+wq*2+400*4+0] + paddd m1, m4 + pslld m1, 2 ; b[ 1] 444 + psubd m2, m1, m4 ; b[ 1] 343 + paddd m4, m2, [t3+wq*2+400*12+ 0] + paddd m4, [t3+wq*2+400*16+ 0] + mova [t3+wq*2+400*12+ 0], m1 + mova [t3+wq*2+400*16+ 0], m2 + movu m5, [t3+wq*2+400*4+24] + movu m1, [t3+wq*2+400*4+20] + paddd m5, [t3+wq*2+400*4+16] + paddd m1, m5 + pslld m1, 2 + psubd m2, m1, m5 + paddd m5, m2, [t3+wq*2+400*12+16] + paddd m5, [t3+wq*2+400*16+16] + mova [t3+wq*2+400*12+16], m1 + mova [t3+wq*2+400*16+16], m2 + mova m0, [dstq+wq] + punpcklwd m1, m0, m6 + punpcklwd m2, m3, m6 + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + psubd m4, m2 ; b - a * src + (1 << 8) + psubd m5, m3 + psrad m4, 9 + psrad m5, 9 + packssdw m4, m5 + pmulhrsw m4, m7 + paddw m0, m4 + pmaxsw m0, m6 + pminsw m0, m13 + mova [dstq+wq], m0 + add wq, 16 + jl .n1_loop + add dstq, stridemp + movif32 dstm, dstq + ret + +%if ARCH_X86_32 + %if STACK_ALIGNMENT < 16 + %assign extra_stack 10*16 + %else + %assign extra_stack 8*16 + %endif +cglobal sgr_filter_mix_16bpc, 1, 7, 8, -400*66-48-extra_stack, \ + dst, stride, left, lpf, w + %if STACK_ALIGNMENT < 16 + %define dstm dword [esp+calloff+16*8+4*0] + %define stridemp dword [esp+calloff+16*8+4*1] + %define leftm dword [esp+calloff+16*8+4*2] + %define lpfm dword [esp+calloff+16*8+4*3] + %define w0m dword [esp+calloff+16*8+4*4] + %define hd dword [esp+calloff+16*8+4*5] + %define edgeb byte [esp+calloff+16*8+4*6] + %define edged dword [esp+calloff+16*8+4*6] + %define leftmp leftm + %else + %define w0m wm + %define hd dword r5m + %define edgeb byte r7m + %define edged dword r7m + %endif + %define hvsrcm dword [esp+calloff+4*0] + %define w1m dword [esp+calloff+4*1] + %define t3m dword [esp+calloff+4*2] + %define t4m dword [esp+calloff+4*3] + %xdefine m8 m6 + %define m9 [base+pd_8] + %define m10 [base+pd_34816] + %define m11 [base+pd_0xf00801c7] + %define m12 [base+pd_0xf00800a4] + %define m13 [esp+calloff+16*4] + %define m14 [esp+calloff+16*5] + %define m15 [esp+calloff+16*6] + %define m6 [esp+calloff+16*7] + %define base r6-$$ + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov strideq, [rstk+stack_offset+ 8] + mov leftq, [rstk+stack_offset+12] + mov lpfq, [rstk+stack_offset+16] + mov wd, 
[rstk+stack_offset+20] + mov dstm, dstq + mov stridemp, strideq + mov leftm, leftq + mov r1, [rstk+stack_offset+24] + mov r2, [rstk+stack_offset+32] + mov lpfm, lpfq + mov hd, r1 + mov edged, r2 + %endif +%else +cglobal sgr_filter_mix_16bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \ + w, h, edge, params +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + movifnidn wd, wm +%endif +%if ARCH_X86_64 + mov paramsq, r6mp + lea r13, [sgr_x_by_x-0xf03] + movifnidn hd, hm + add wd, wd + mov edged, r7m + mova m14, [paramsq] + add lpfq, wq + mova m9, [pd_8] + lea t1, [rsp+wq+44] + mova m10, [pd_34816] + add dstq, wq + mova m11, [pd_0xf00801c7] + lea t3, [rsp+wq*2+400*24+40] + mova m12, [pd_0xf00800a4] + lea t4, [rsp+wq+400*52+40] + neg wq + pshufd m15, m14, q2222 ; w0 w1 + punpcklwd m14, m14 + pshufd m13, m14, q0000 ; s0 + pshufd m14, m14, q2222 ; s1 + pxor m6, m6 + psllw m15, 2 + DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w + %define lpfm [rsp] +%else + mov r1, [rstk+stack_offset+28] ; params + LEA r6, $$ + add wd, wd + mova m2, [r1] + add lpfm, wq + lea t1, [rsp+extra_stack+wq+52] + add dstq, wq + lea t3, [rsp+extra_stack+wq*2+400*24+48] + mov dstm, dstq + lea t4, [rsp+extra_stack+wq+400*52+48] + mov t3m, t3 + mov t4m, t4 + neg wq + pshuflw m0, m2, q0000 + pshuflw m1, m2, q2222 + pshufhw m2, m2, q1010 + punpcklqdq m0, m0 ; s0 + punpcklqdq m1, m1 ; s1 + punpckhqdq m2, m2 ; w0 w1 + mov w1m, wd + pxor m3, m3 + psllw m2, 2 + mova m13, m0 + mova m14, m1 + sub wd, 4 + mova m15, m2 + mova m6, m3 + mov lpfq, lpfm + mov w0m, wd + %define strideq r5 +%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, stridemp + mov t2, t1 +%if ARCH_X86_64 + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup +%else + mov wq, w0m + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup_loop +%endif + add t1, 400*12 + call .h_top + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, strideq + mov lpfm, r10 ; below + movif32 t4, t4m + call .hv0 +.main: + dec hd + jz .height1 + movif32 lpfq, hvsrcm + add lpfq, stridemp + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + movif32 lpfq, hvsrcm + add lpfq, stridemp + call .hv0 +%if ARCH_X86_64 + test hd, hd +%else + mov r4, hd + test r4, r4 +%endif + jz .odd_height + movif32 lpfq, hvsrcm + add lpfq, stridemp + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, lpfm + call .hv0_bottom + movif32 lpfq, hvsrcm + add lpfq, stridemp + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov lpfm, r10 + call .h +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov wq, w0m + mov hvsrcm, lpfq +%endif + lea t2, [t1+400*12] +.top_fixup_loop: + mova m0, [t1+wq+400* 0] + mova m1, [t1+wq+400* 2] + mova m2, [t1+wq+400* 4] + paddw m0, m0 + mova m3, [t1+wq+400* 6] + paddd m1, m1 + mova m4, [t1+wq+400* 8] + paddd m2, m2 + mova m5, [t1+wq+400*10] + mova [t2+wq+400* 0], m0 + mova [t2+wq+400* 2], m1 + mova [t2+wq+400* 4], m2 + mova [t2+wq+400* 6], m3 + mova [t2+wq+400* 8], m4 + mova [t2+wq+400*10], m5 + add wq, 16 + jl .top_fixup_loop + movif32 t3, t3m + movif32 t4, t4m + call 
.v0 + jmp .main +.h: ; horizontal boxsum +%assign stack_offset stack_offset+4 +%assign calloff 4 +%if ARCH_X86_64 + lea wq, [r4-4] +%else + %define leftq r4 +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 10 + jmp .h_main +.h_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, [base+sgr_lshuf5] + jmp .h_main +.h_top: +%if ARCH_X86_64 + lea wq, [r4-4] +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 wq, w0m +.h_loop: + movu m4, [lpfq+wq- 2] +.h_main: + movu m5, [lpfq+wq+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp wd, -20 + jl .h_have_right +%if ARCH_X86_32 + pxor m8, m8 +%endif + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right +.h_have_right: + palignr m3, m5, m4, 2 + palignr m0, m5, m4, 4 + paddw m1, m3, m0 + punpcklwd m2, m3, m0 + pmaddwd m2, m2 + punpckhwd m3, m0 + pmaddwd m3, m3 + palignr m0, m5, m4, 6 + paddw m1, m0 ; sum3 + punpcklwd m7, m0, m6 + pmaddwd m7, m7 + punpckhwd m0, m6 + pmaddwd m0, m0 + paddd m2, m7 ; sumsq3 + palignr m5, m4, 8 + punpcklwd m7, m5, m4 + paddw m8, m4, m5 + pmaddwd m7, m7 + punpckhwd m5, m4 + pmaddwd m5, m5 + paddd m3, m0 + mova [t1+wq+400* 6], m1 + mova [t1+wq+400* 8], m2 + mova [t1+wq+400*10], m3 + paddw m8, m1 ; sum5 + paddd m7, m2 ; sumsq5 + paddd m5, m3 + mova [t1+wq+400* 0], m8 + mova [t1+wq+400* 2], m7 + mova [t1+wq+400* 4], m5 + add wq, 16 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows) +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 10 + jmp .hv0_main +.hv0_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, [base+sgr_lshuf5] + jmp .hv0_main +.hv0_bottom: +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv0_loop_start +%endif +.hv0_loop: + movif32 lpfq, hvsrcm +.hv0_loop_start: + movu m4, [lpfq+wq- 2] +.hv0_main: + movu m5, [lpfq+wq+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp wd, -20 + jl .hv0_have_right +%if ARCH_X86_32 + pxor m8, m8 +%endif + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right +.hv0_have_right: + palignr m3, m5, m4, 2 + palignr m0, m5, m4, 4 + movif32 t3, t3m + paddw m1, m3, m0 + punpcklwd m2, m3, m0 + pmaddwd m2, m2 + punpckhwd m3, m0 + pmaddwd m3, m3 + palignr m0, m5, m4, 6 + paddw m1, m0 ; h sum3 + punpcklwd m7, m0, m6 + pmaddwd m7, m7 + punpckhwd m0, m6 + pmaddwd m0, m0 + paddd m2, m7 ; h sumsq3 + palignr m5, m4, 8 + punpcklwd m7, m5, m4 + paddw m8, m4, m5 + pmaddwd m7, m7 + punpckhwd m5, m4 + pmaddwd m5, m5 + paddd m3, m0 + paddw m8, m1 ; h sum5 + paddd m7, m2 ; h sumsq5 + paddd m5, m3 + mova [t3+wq*2+400*8+ 8], m8 + mova [t3+wq*2+400*0+ 8], m7 + mova [t3+wq*2+400*0+24], m5 + paddw m8, [t1+wq+400* 0] + paddd m7, [t1+wq+400* 2] + paddd m5, [t1+wq+400* 4] + mova [t1+wq+400* 0], m8 + mova [t1+wq+400* 2], m7 + mova [t1+wq+400* 4], m5 + paddw m0, m1, [t1+wq+400* 6] + paddd m4, m2, [t1+wq+400* 8] + paddd m5, m3, [t1+wq+400*10] + mova [t1+wq+400* 6], m1 + mova [t1+wq+400* 8], m2 + mova [t1+wq+400*10], m3 + paddw m1, m0, [t2+wq+400* 6] + paddd m2, m4, [t2+wq+400* 8] + paddd m3, m5, [t2+wq+400*10] + mova [t2+wq+400* 6], 
m0 + mova [t2+wq+400* 8], m4 + mova [t2+wq+400*10], m5 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m7 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m7 + pmaddwd m2, m2 + punpckhwd m3, m7 + pmaddwd m3, m3 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + MAXSD m4, m2, m7 + MAXSD m5, m3, m7 + psubd m4, m2 ; p3 + psubd m5, m3 + MULLD m4, m14, m7 ; p3 * s1 + MULLD m5, m14, m7 + pmaddwd m0, m11 ; b3 * 455 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m4, 20 ; min(z3, 255) + psrld m5, 20 + GATHER_X_BY_X m3, m4, m5, r0, dstm + punpcklwd m4, m3, m3 + punpckhwd m5, m3, m3 + MULLD m0, m4, m7 + MULLD m1, m5, m7 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + mova [t4+wq*1+400*2+ 4], m3 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+400*4+ 8], m0 + mova [t3+wq*2+400*4+24], m1 + add wq, 16 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movif32 leftq, leftm + movddup m5, [leftq] + movif32 wq, w0m + mova m4, [lpfq+wq+4] + add leftmp, 8 + palignr m4, m5, 10 + jmp .hv1_main +.hv1_extend_left: + movif32 wq, w0m + mova m4, [lpfq+wq+4] + pshufb m4, [base+sgr_lshuf5] + jmp .hv1_main +.hv1_bottom: +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv1_loop_start +%endif +.hv1_loop: + movif32 lpfq, hvsrcm +.hv1_loop_start: + movu m4, [lpfq+wq- 2] +.hv1_main: + movu m5, [lpfq+wq+14] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp wd, -20 + jl .hv1_have_right +%if ARCH_X86_32 + pxor m8, m8 +%endif + call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right +.hv1_have_right: + palignr m7, m5, m4, 2 + palignr m3, m5, m4, 4 + paddw m2, m7, m3 + punpcklwd m0, m7, m3 + pmaddwd m0, m0 + punpckhwd m7, m3 + pmaddwd m7, m7 + palignr m3, m5, m4, 6 + paddw m2, m3 ; h sum3 + punpcklwd m1, m3, m6 + pmaddwd m1, m1 + punpckhwd m3, m6 + pmaddwd m3, m3 + paddd m0, m1 ; h sumsq3 + palignr m5, m4, 8 + punpckhwd m1, m4, m5 + paddw m8, m4, m5 + pmaddwd m1, m1 + punpcklwd m4, m5 + pmaddwd m4, m4 + paddd m7, m3 + paddw m5, m2, [t2+wq+400* 6] + mova [t2+wq+400* 6], m2 + paddw m8, m2 ; h sum5 + paddd m2, m0, [t2+wq+400* 8] + paddd m3, m7, [t2+wq+400*10] + mova [t2+wq+400* 8], m0 + mova [t2+wq+400*10], m7 + paddd m4, m0 ; h sumsq5 + paddd m1, m7 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 + pslld m0, m2, 3 + pslld m7, m3, 3 + paddd m2, m0 ; ((a3 + 8) >> 4) * 9 + paddd m3, m7 + psrlw m7, m5, 1 + pavgw m7, m6 ; (b3 + 2) >> 2 + punpcklwd m0, m7, m6 + pmaddwd m0, m0 + punpckhwd m7, m6 + pmaddwd m7, m7 +%if ARCH_X86_32 + mova [esp+20], m8 +%else + SWAP m8, m6 +%endif + MAXSD m2, m0, m8 + MAXSD m3, m7, m8 + pxor m8, m8 + psubd m2, m0 ; p3 + psubd m3, m7 + punpcklwd m0, m5, m8 ; b3 + punpckhwd m5, m8 + MULLD m2, m14, m8 ; p3 * s1 + MULLD m3, m14, m8 + pmaddwd m0, m11 ; b3 * 455 + pmaddwd m5, m11 + paddusw m2, m11 + paddusw m3, m11 + psrld m2, 20 ; min(z3, 255) + movif32 t3, t3m + psrld m3, 20 + GATHER_X_BY_X m8, m2, m3, r0, dstm + punpcklwd m2, m8, m8 + punpckhwd m3, m8, m8 + MULLD m0, m2, m7 + MULLD m5, m3, m7 + paddd m0, m10 ; x3 
* b3 * 455 + (1 << 11) + (1 << 15) + paddd m5, m10 + psrld m0, 12 + psrld m5, 12 + mova [t4+wq*1+400*4+4], m8 + mova [t3+wq*2+400*8+ 8], m0 + mova [t3+wq*2+400*8+24], m5 +%if ARCH_X86_32 + mova m8, [esp+20] +%else + SWAP m6, m8 + pxor m6, m6 +%endif + paddw m5, m8, [t2+wq+400*0] + paddd m2, m4, [t2+wq+400*2] + paddd m3, m1, [t2+wq+400*4] + paddw m5, [t1+wq+400*0] + paddd m2, [t1+wq+400*2] + paddd m3, [t1+wq+400*4] + mova [t2+wq+400*0], m8 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a5 + 8) >> 4 + psrld m3, 4 + mova [t2+wq+400*2], m4 + pslld m8, m2, 4 + mova [t2+wq+400*4], m1 + pslld m4, m3, 4 + paddd m8, m2 + pslld m2, 3 + paddd m4, m3 + pslld m3, 3 + paddd m2, m8 ; ((a5 + 8) >> 4) * 25 + paddd m3, m4 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + psrlw m1, m5, 1 + pavgw m1, m7 ; (b5 + 2) >> 2 + punpcklwd m4, m1, m7 + pmaddwd m4, m4 + punpckhwd m1, m7 + pmaddwd m1, m1 + punpcklwd m0, m5, m7 ; b5 + punpckhwd m5, m7 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + MAXSD m2, m4, m7 + psubd m2, m4 ; p5 + MAXSD m3, m1, m7 + psubd m3, m1 + MULLD m2, m13, m7 ; p5 * s0 + MULLD m3, m13, m7 + pmaddwd m0, m12 ; b5 * 164 + pmaddwd m5, m12 + paddusw m2, m12 + paddusw m3, m12 + psrld m2, 20 ; min(z5, 255) + psrld m3, 20 + GATHER_X_BY_X m1, m2, m3, r0, dstm + punpcklwd m2, m1, m1 + punpckhwd m3, m1, m1 + MULLD m0, m2, m7 + MULLD m5, m3, m7 + paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m5, m10 + mova [t4+wq*1+400*0+ 4], m1 + psrld m0, 12 + psrld m5, 12 + mova [t3+wq*2+400*0+ 8], m0 + mova [t3+wq*2+400*0+24], m5 + add wq, 16 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab3 (even rows) +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov wd, w0m +%endif +.v0_loop: + mova m0, [t1+wq+400* 6] + mova m4, [t1+wq+400* 8] + mova m5, [t1+wq+400*10] + paddw m0, m0 + paddd m4, m4 + paddd m5, m5 + paddw m1, m0, [t2+wq+400* 6] + paddd m2, m4, [t2+wq+400* 8] + paddd m3, m5, [t2+wq+400*10] + mova [t2+wq+400* 6], m0 + mova [t2+wq+400* 8], m4 + mova [t2+wq+400*10], m5 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m7 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m7 + pmaddwd m2, m2 + punpckhwd m3, m7 + pmaddwd m3, m3 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + MAXSD m4, m2, m7 + MAXSD m5, m3, m7 + psubd m4, m2 ; p3 + psubd m5, m3 + MULLD m4, m14, m7 ; p3 * s1 + MULLD m5, m14, m7 + pmaddwd m0, m11 ; b3 * 455 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m4, 20 ; min(z3, 255) + psrld m5, 20 + GATHER_X_BY_X m3, m4, m5, r0, dstm + punpcklwd m4, m3, m3 + punpckhwd m5, m3, m3 + MULLD m0, m4, m7 + MULLD m1, m5, m7 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + mova [t4+wq*1+400*2+4], m3 + psrld m0, 12 + psrld m1, 12 + mova m3, [t1+wq+400*0] + mova m4, [t1+wq+400*2] + mova m5, [t1+wq+400*4] + mova [t3+wq*2+400*8+ 8], m3 + mova [t3+wq*2+400*0+ 8], m4 + mova [t3+wq*2+400*0+24], m5 + paddw m3, m3 ; cc5 + paddd m4, m4 + paddd m5, m5 + mova [t1+wq+400*0], m3 + mova [t1+wq+400*2], m4 + mova [t1+wq+400*4], m5 + mova [t3+wq*2+400*4+ 8], m0 + mova [t3+wq*2+400*4+24], m1 + add wq, 16 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) +%if ARCH_X86_64 + lea wq, [r4-4] +%else + mov wd, w0m +%endif +.v1_loop: + mova m4, [t1+wq+400* 6] + mova m5, [t1+wq+400* 8] + mova m7, [t1+wq+400*10] + 
paddw m1, m4, [t2+wq+400* 6] + paddd m2, m5, [t2+wq+400* 8] + paddd m3, m7, [t2+wq+400*10] + mova [t2+wq+400* 6], m4 + mova [t2+wq+400* 8], m5 + mova [t2+wq+400*10], m7 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a3 + 8) >> 4 + psrld m3, 4 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + psrlw m3, m1, 1 + pavgw m3, m7 ; (b3 + 2) >> 2 + punpcklwd m2, m3, m7 + pmaddwd m2, m2 + punpckhwd m3, m7 + pmaddwd m3, m3 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + MAXSD m4, m2, m7 + MAXSD m5, m3, m7 + psubd m4, m2 ; p3 + psubd m5, m3 + MULLD m4, m14, m7 ; p3 * s1 + MULLD m5, m14, m7 + pmaddwd m0, m11 ; b3 * 455 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m4, 20 ; min(z3, 255) + psrld m5, 20 + GATHER_X_BY_X m3, m4, m5, r0, dstm + punpcklwd m4, m3, m3 + punpckhwd m5, m3, m3 + MULLD m0, m4, m7 + MULLD m1, m5, m7 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + mova [t4+wq*1+400*4+4], m3 + psrld m0, 12 + psrld m8, m1, 12 + mova m4, [t3+wq*2+400*8+ 8] + mova m5, [t3+wq*2+400*0+ 8] + mova m7, [t3+wq*2+400*0+24] + paddw m1, m4, [t2+wq+400*0] + paddd m2, m5, [t2+wq+400*2] + paddd m3, m7, [t2+wq+400*4] + paddw m1, [t1+wq+400*0] + paddd m2, [t1+wq+400*2] + paddd m3, [t1+wq+400*4] + mova [t2+wq+400*0], m4 + mova [t2+wq+400*2], m5 + mova [t2+wq+400*4], m7 + paddd m2, m9 + paddd m3, m9 + psrld m2, 4 ; (a5 + 8) >> 4 + psrld m3, 4 + mova [t3+wq*2+400*8+ 8], m0 + pslld m4, m2, 4 + mova [t3+wq*2+400*8+24], m8 + pslld m5, m3, 4 + paddd m4, m2 + pslld m2, 3 + paddd m5, m3 + pslld m3, 3 + paddd m2, m4 + paddd m3, m5 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + psrlw m5, m1, 1 + pavgw m5, m7 ; (b5 + 2) >> 2 + punpcklwd m4, m5, m7 + pmaddwd m4, m4 + punpckhwd m5, m7 + pmaddwd m5, m5 + punpcklwd m0, m1, m7 ; b5 + punpckhwd m1, m7 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + MAXSD m2, m4, m7 + psubd m2, m4 ; p5 + MAXSD m3, m5, m7 + psubd m3, m5 + MULLD m2, m13, m7 ; p5 * s0 + MULLD m3, m13, m7 + pmaddwd m0, m12 ; b5 * 164 + pmaddwd m1, m12 + paddusw m2, m12 + paddusw m3, m12 + psrld m2, 20 ; min(z5, 255) + psrld m3, 20 + GATHER_X_BY_X m4, m2, m3, r0, dstm + punpcklwd m2, m4, m4 + punpckhwd m3, m4, m4 + MULLD m0, m2, m7 + MULLD m1, m3, m7 + paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m1, m10 + mova [t4+wq*1+400*0+ 4], m4 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*2+400*0+ 8], m0 + mova [t3+wq*2+400*0+24], m1 + add wq, 16 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + movif64 wq, r4 + movif32 wd, w1m +.prep_n_loop: + movu m0, [t4+wq*1+400*0+ 2] + movu m1, [t3+wq*2+400*0+ 4] + movu m2, [t3+wq*2+400*0+20] + movu m7, [t4+wq*1+400*0+ 4] + movu m8, [t3+wq*2+400*0+ 8] + paddw m3, m0, [t4+wq*1+400*0+ 0] + paddd m4, m1, [t3+wq*2+400*0+ 0] + paddd m5, m2, [t3+wq*2+400*0+16] + paddw m3, m7 + paddd m4, m8 + movu m7, [t3+wq*2+400*0+24] + paddw m0, m3 + paddd m1, m4 + psllw m3, 2 + pslld m4, 2 + paddd m5, m7 + paddd m2, m5 + pslld m5, 2 + paddw m0, m3 ; a5 565 + paddd m1, m4 ; b5 565 + paddd m2, m5 + mova [t4+wq*1+400* 6+ 0], m0 + mova [t3+wq*2+400*12+ 0], m1 + mova [t3+wq*2+400*12+16], m2 + movu m0, [t4+wq*1+400*2+ 4] + movu m1, [t3+wq*2+400*4+ 8] + movu m2, [t3+wq*2+400*4+24] + movu m3, [t4+wq*1+400*2+ 2] + movu m4, [t3+wq*2+400*4+ 4] + movu m5, [t3+wq*2+400*4+20] + paddw m0, [t4+wq*1+400*2+ 0] + paddd m1, [t3+wq*2+400*4+ 0] + paddd m2, [t3+wq*2+400*4+16] + paddw m3, m0 
+ paddd m4, m1 + paddd m5, m2 + psllw m3, 2 ; a3[-1] 444 + pslld m4, 2 ; b3[-1] 444 + pslld m5, 2 + psubw m3, m0 ; a3[-1] 343 + psubd m4, m1 ; b3[-1] 343 + psubd m5, m2 + mova [t4+wq*1+400* 8+ 0], m3 + mova [t3+wq*2+400*16+ 0], m4 + mova [t3+wq*2+400*16+16], m5 + movu m0, [t4+wq*1+400*4+ 4] + movu m1, [t3+wq*2+400*8+ 8] + movu m2, [t3+wq*2+400*8+24] + movu m3, [t4+wq*1+400*4+ 2] + movu m4, [t3+wq*2+400*8+ 4] + movu m5, [t3+wq*2+400*8+20] + paddw m0, [t4+wq*1+400*4+ 0] + paddd m1, [t3+wq*2+400*8+ 0] + paddd m2, [t3+wq*2+400*8+16] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + psllw m3, 2 ; a3[ 0] 444 + pslld m4, 2 ; b3[ 0] 444 + pslld m5, 2 + mova [t4+wq*1+400*10+ 0], m3 + mova [t3+wq*2+400*20+ 0], m4 + mova [t3+wq*2+400*20+16], m5 + psubw m3, m0 ; a3[ 0] 343 + psubd m4, m1 ; b3[ 0] 343 + psubd m5, m2 + mova [t4+wq*1+400*12+ 0], m3 + mova [t3+wq*2+400*24+ 0], m4 + mova [t3+wq*2+400*24+16], m5 + add wq, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + movif64 wq, r4 + movif32 wd, w1m +.n0_loop: + movu m0, [t4+wq*1+ 4] + movu m2, [t4+wq*1+ 2] + paddw m0, [t4+wq*1+ 0] + paddw m0, m2 + paddw m2, m0 + psllw m0, 2 + paddw m0, m2 ; a5 + movu m4, [t3+wq*2+ 8] + movu m5, [t3+wq*2+24] + movu m1, [t3+wq*2+ 4] + movu m3, [t3+wq*2+20] + paddd m4, [t3+wq*2+ 0] + paddd m5, [t3+wq*2+16] + paddd m4, m1 + paddd m5, m3 + paddd m1, m4 + paddd m3, m5 + pslld m4, 2 + pslld m5, 2 + paddd m4, m1 ; b5 + paddd m5, m3 + movu m2, [t4+wq*1+400* 6] + paddw m2, m0 + mova [t4+wq*1+400* 6], m0 + paddd m0, m4, [t3+wq*2+400*12+ 0] + paddd m1, m5, [t3+wq*2+400*12+16] + mova [t3+wq*2+400*12+ 0], m4 + mova [t3+wq*2+400*12+16], m5 + mova [rsp+16+ARCH_X86_32*4], m1 + movu m3, [t4+wq*1+400*2+4] + movu m5, [t4+wq*1+400*2+2] + paddw m3, [t4+wq*1+400*2+0] + paddw m5, m3 + psllw m5, 2 ; a3[ 1] 444 + psubw m4, m5, m3 ; a3[ 1] 343 + movu m3, [t4+wq*1+400* 8] + paddw m3, [t4+wq*1+400*10] + paddw m3, m4 + mova [t4+wq*1+400* 8], m4 + mova [t4+wq*1+400*10], m5 + movu m1, [t3+wq*2+400*4+ 8] + movu m5, [t3+wq*2+400*4+ 4] + movu m7, [t3+wq*2+400*4+24] + movu m8, [t3+wq*2+400*4+20] + paddd m1, [t3+wq*2+400*4+ 0] + paddd m7, [t3+wq*2+400*4+16] + paddd m5, m1 + paddd m8, m7 + pslld m5, 2 ; b3[ 1] 444 + pslld m8, 2 + psubd m4, m5, m1 ; b3[ 1] 343 +%if ARCH_X86_32 + mova [esp+52], m8 + psubd m8, m7 +%else + psubd m6, m8, m7 + SWAP m8, m6 +%endif + paddd m1, m4, [t3+wq*2+400*16+ 0] + paddd m7, m8, [t3+wq*2+400*16+16] + paddd m1, [t3+wq*2+400*20+ 0] + paddd m7, [t3+wq*2+400*20+16] + mova [t3+wq*2+400*16+ 0], m4 + mova [t3+wq*2+400*16+16], m8 + mova [t3+wq*2+400*20+ 0], m5 +%if ARCH_X86_32 + mova m8, [esp+52] +%else + SWAP m8, m6 + pxor m6, m6 +%endif + mova [t3+wq*2+400*20+16], m8 + mova [rsp+32+ARCH_X86_32*4], m7 + movu m5, [dstq+wq] + punpcklwd m4, m5, m6 + punpcklwd m7, m2, m6 + pmaddwd m7, m4 ; a5 * src + punpcklwd m8, m3, m6 + pmaddwd m8, m4 ; a3 * src + punpckhwd m5, m6 + punpckhwd m2, m6 + pmaddwd m2, m5 + punpckhwd m3, m6 + pmaddwd m3, m5 + pslld m4, 13 + pslld m5, 13 + psubd m0, m7 ; b5 - a5 * src + (1 << 8) + psubd m1, m8 ; b3 - a3 * src + (1 << 8) + mova m7, [base+pd_0xffff] + psrld m0, 9 + pslld m1, 7 + pand m0, m7 + pandn m8, m7, m1 + por m0, m8 + mova m1, [rsp+16+ARCH_X86_32*4] + mova m8, [rsp+32+ARCH_X86_32*4] + psubd m1, m2 + psubd m8, m3 + mova m2, [base+pd_4096] + psrld m1, 9 + pslld m8, 7 + pand m1, m7 + pandn m7, m8 + por m1, m7 + pmaddwd m0, m15 + pmaddwd m1, m15 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + paddd m4, m2 + paddd m5, m2 + paddd m0, m4 + paddd m1, m5 + psrad 
m0, 8 + psrad m1, 8 + packssdw m0, m1 ; clip + pmaxsw m0, m7 + psrlw m0, 5 + mova [dstq+wq], m0 + add wq, 16 + jl .n0_loop + add dstq, stridemp + ret +%if ARCH_X86_64 + SWAP m6, m7 +%endif +ALIGN function_align +.n1: ; neighbor + output (odd rows) + movif64 wq, r4 + movif32 wd, w1m +.n1_loop: + movu m3, [t4+wq*1+400*4+4] + movu m5, [t4+wq*1+400*4+2] + paddw m3, [t4+wq*1+400*4+0] + paddw m5, m3 + psllw m5, 2 ; a3[ 1] 444 + psubw m4, m5, m3 ; a3[ 1] 343 + paddw m3, m4, [t4+wq*1+400*12] + paddw m3, [t4+wq*1+400*10] + mova [t4+wq*1+400*10], m5 + mova [t4+wq*1+400*12], m4 + movu m1, [t3+wq*2+400*8+ 8] + movu m5, [t3+wq*2+400*8+ 4] + movu m7, [t3+wq*2+400*8+24] + movu m8, [t3+wq*2+400*8+20] + paddd m1, [t3+wq*2+400*8+ 0] + paddd m7, [t3+wq*2+400*8+16] + paddd m5, m1 + paddd m8, m7 + pslld m5, 2 ; b3[ 1] 444 + pslld m8, 2 + psubd m4, m5, m1 ; b3[ 1] 343 + psubd m0, m8, m7 + paddd m1, m4, [t3+wq*2+400*24+ 0] + paddd m7, m0, [t3+wq*2+400*24+16] + paddd m1, [t3+wq*2+400*20+ 0] + paddd m7, [t3+wq*2+400*20+16] + mova [t3+wq*2+400*20+ 0], m5 + mova [t3+wq*2+400*20+16], m8 + mova [t3+wq*2+400*24+ 0], m4 + mova [t3+wq*2+400*24+16], m0 + mova m5, [dstq+wq] + mova m2, [t4+wq*1+400* 6] + punpcklwd m4, m5, m6 + punpcklwd m8, m2, m6 + pmaddwd m8, m4 ; a5 * src + punpcklwd m0, m3, m6 + pmaddwd m0, m4 ; a3 * src + punpckhwd m5, m6 + punpckhwd m2, m6 + pmaddwd m2, m5 + punpckhwd m3, m6 + pmaddwd m3, m5 + psubd m1, m0 ; b3 - a3 * src + (1 << 8) + pslld m4, 13 + pslld m5, 13 + mova m0, [t3+wq*2+400*12+ 0] + psubd m0, m8 ; b5 - a5 * src + (1 << 8) + mova m8, [t3+wq*2+400*12+16] + psubd m8, m2 + psubd m7, m3 + mova m2, [base+pd_0xffff] + pslld m1, 7 + psrld m0, 8 + psrld m8, 8 + pslld m7, 7 + pand m0, m2 + pandn m3, m2, m1 + por m0, m3 + pand m8, m2 + pandn m2, m7 + por m2, m8 + mova m1, [base+pd_4096] + pmaddwd m0, m15 + pmaddwd m2, m15 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + pxor m7, m7 + paddd m4, m1 + paddd m5, m1 + paddd m0, m4 + paddd m2, m5 + psrad m0, 8 + psrad m2, 8 + packssdw m0, m2 ; clip + pmaxsw m0, m7 + psrlw m0, 5 + mova [dstq+wq], m0 + add wq, 16 + jl .n1_loop + add dstq, stridemp + movif32 dstm, dstq + ret diff --git a/third_party/dav1d/src/x86/looprestoration_avx2.asm b/third_party/dav1d/src/x86/looprestoration_avx2.asm new file mode 100644 index 0000000000..a73cb21882 --- /dev/null +++ b/third_party/dav1d/src/x86/looprestoration_avx2.asm @@ -0,0 +1,2237 @@ +; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 + +wiener_l_shuf: db 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 +wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14 +wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12 +sgr_r_ext: times 16 db 1 + times 16 db 9 + +; dword version of dav1d_sgr_x_by_x[] for use with gathers, wastes a bit of +; cache but eliminates some shifts in the inner sgr loop which is overall a win +const sgr_x_by_x_avx2 + dd 255,128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16 + dd 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8 + dd 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5 + dd 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4 + dd 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3 + dd 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 + dd 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1 + dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 + + times 4 db -1 ; needed for 16-bit sgr +pb_m5: times 4 db -5 +pb_3: times 4 db 3 +pw_5_6: dw 5, 6 + +sgr_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 +sgr_shuf: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1 + db 9, -1, 10, -1, 11, -1, 12, -1 + +pw_256: times 2 dw 256 +pw_2056: times 2 dw 2056 +pw_m16380: times 2 dw -16380 +pd_25: dd 25 +pd_34816: dd 34816 +pd_m4096: dd -4096 +pd_0xf00801c7: dd 0xf00801c7 +pd_0xf00800a4: dd 0xf00800a4 + +SECTION .text + +DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers + +INIT_YMM avx2 +cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \ + w, h, edge, flt + mov fltq, r6mp + movifnidn hd, hm + mov edged, r7m + mov wd, wm + vbroadcasti128 m6, [wiener_shufA] + vpbroadcastb m11, [fltq+ 0] ; x0 x0 + vbroadcasti128 m7, [wiener_shufB] + vpbroadcastd m12, [fltq+ 2] + vbroadcasti128 m8, [wiener_shufC] + packsswb m12, m12 ; x1 x2 + vpbroadcastw m13, [fltq+ 6] ; x3 + vbroadcasti128 m9, [sgr_shuf+6] + add lpfq, wq + vpbroadcastd m10, [pw_m16380] + vpbroadcastd m14, [fltq+16] ; y0 y1 + add dstq, wq + vpbroadcastd m15, [fltq+20] ; y2 y3 + lea t1, [rsp+wq*2+16] + psllw m14, 5 + neg wq + psllw m15, 5 + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t6, t1 + mov t5, t1 + add t1, 384*2 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + mov t4, t1 + add t1, 384*2 + add r10, 
strideq + mov [rsp], r10 ; below + call .h + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v3 +.main: + lea t0, [t1+384*2] +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v3 + mov lpfq, [rsp] + call .hv_bottom + add lpfq, strideq + call .hv_bottom +.v1: + call .v + RET +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + mov t6, t1 + mov t5, t1 + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v3 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v3 + add t0, 384*8 + call .hv + dec hd + jnz .main +.v3: + call .v +.v2: + call .v + jmp .v1 +.extend_right: + movd xm2, r10d + vpbroadcastd m0, [pb_3] + vpbroadcastd m1, [pb_m5] + vpbroadcastb m2, xm2 + movu m3, [pb_0to31] + psubb m0, m2 + psubb m1, m2 + pminub m0, m3 + pminub m1, m3 + pshufb m4, m0 + pshufb m5, m1 + ret +.h: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movd xm4, [leftq] + vpblendd m4, [lpfq+r10-4], 0xfe + add leftq, 4 + jmp .h_main +.h_extend_left: + vbroadcasti128 m5, [lpfq+r10] ; avoid accessing memory located + mova m4, [lpfq+r10] ; before the start of the buffer + palignr m4, m5, 12 + pshufb m4, [wiener_l_shuf] + jmp .h_main +.h_top: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m4, [lpfq+r10-4] +.h_main: + movu m5, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -34 + jl .h_have_right + call .extend_right +.h_have_right: + pshufb m0, m4, m6 + pmaddubsw m0, m11 + pshufb m1, m5, m6 + pmaddubsw m1, m11 + pshufb m2, m4, m7 + pmaddubsw m2, m12 + pshufb m3, m5, m7 + pmaddubsw m3, m12 + paddw m0, m2 + pshufb m2, m4, m8 + pmaddubsw m2, m12 + paddw m1, m3 + pshufb m3, m5, m8 + pmaddubsw m3, m12 + pshufb m4, m9 + paddw m0, m2 + pmullw m2, m4, m13 + pshufb m5, m9 + paddw m1, m3 + pmullw m3, m5, m13 + psllw m4, 7 + psllw m5, 7 + paddw m4, m10 + paddw m5, m10 + paddw m0, m2 + vpbroadcastd m2, [pw_2056] + paddw m1, m3 + paddsw m0, m4 + paddsw m1, m5 + psraw m0, 3 + psraw m1, 3 + paddw m0, m2 + paddw m1, m2 + mova [t1+r10*2+ 0], m0 + mova [t1+r10*2+32], m1 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv: + add lpfq, strideq + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movd xm4, [leftq] + vpblendd m4, [lpfq+r10-4], 0xfe + add leftq, 4 + jmp .hv_main +.hv_extend_left: + movu m4, [lpfq+r10-4] + pshufb m4, [wiener_l_shuf] + jmp .hv_main +.hv_bottom: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m4, [lpfq+r10-4] +.hv_main: + movu m5, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -34 + jl .hv_have_right + call .extend_right +.hv_have_right: + pshufb m0, m4, m6 + pmaddubsw m0, m11 + pshufb m1, m5, m6 + pmaddubsw m1, m11 + pshufb m2, m4, m7 + pmaddubsw m2, m12 + pshufb m3, m5, m7 + pmaddubsw m3, m12 + paddw m0, m2 + pshufb m2, m4, m8 + pmaddubsw m2, m12 + paddw m1, m3 + pshufb m3, m5, m8 + pmaddubsw m3, m12 + pshufb m4, m9 + paddw m0, m2 + pmullw m2, m4, m13 + pshufb m5, m9 + paddw m1, m3 + pmullw m3, m5, m13 + psllw m4, 7 + psllw m5, 7 + paddw m4, m10 + paddw m5, m10 + paddw m0, m2 + paddw m1, m3 + mova m2, [t4+r10*2] + paddw m2, [t2+r10*2] + mova m3, [t3+r10*2] + paddsw m0, m4 + 
vpbroadcastd m4, [pw_2056] + paddsw m1, m5 + mova m5, [t5+r10*2] + paddw m5, [t1+r10*2] + psraw m0, 3 + psraw m1, 3 + paddw m0, m4 + paddw m1, m4 + paddw m4, m0, [t6+r10*2] + mova [t0+r10*2], m0 + punpcklwd m0, m2, m3 + pmaddwd m0, m15 + punpckhwd m2, m3 + pmaddwd m2, m15 + punpcklwd m3, m4, m5 + pmaddwd m3, m14 + punpckhwd m4, m5 + pmaddwd m4, m14 + paddd m0, m3 + paddd m4, m2 + mova m2, [t4+r10*2+32] + paddw m2, [t2+r10*2+32] + mova m3, [t3+r10*2+32] + mova m5, [t5+r10*2+32] + paddw m5, [t1+r10*2+32] + packuswb m0, m4 + paddw m4, m1, [t6+r10*2+32] + mova [t0+r10*2+32], m1 + punpcklwd m1, m2, m3 + pmaddwd m1, m15 + punpckhwd m2, m3 + pmaddwd m2, m15 + punpcklwd m3, m4, m5 + pmaddwd m3, m14 + punpckhwd m4, m5 + pmaddwd m4, m14 + paddd m1, m3 + paddd m2, m4 + packuswb m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + mova [dstq+r10], m0 + add r10, 32 + jl .hv_loop + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t6 + add dstq, strideq + ret +.v: + mov r10, wq +.v_loop: + mova m2, [t4+r10*2+ 0] + paddw m2, [t2+r10*2+ 0] + mova m4, [t3+r10*2+ 0] + mova m6, [t1+r10*2+ 0] + paddw m8, m6, [t6+r10*2+ 0] + paddw m6, [t5+r10*2+ 0] + mova m3, [t4+r10*2+32] + paddw m3, [t2+r10*2+32] + mova m5, [t3+r10*2+32] + mova m7, [t1+r10*2+32] + paddw m9, m7, [t6+r10*2+32] + paddw m7, [t5+r10*2+32] + punpcklwd m0, m2, m4 + pmaddwd m0, m15 + punpckhwd m2, m4 + pmaddwd m2, m15 + punpcklwd m4, m8, m6 + pmaddwd m4, m14 + punpckhwd m6, m8, m6 + pmaddwd m6, m14 + punpcklwd m1, m3, m5 + pmaddwd m1, m15 + punpckhwd m3, m5 + pmaddwd m3, m15 + punpcklwd m5, m9, m7 + pmaddwd m5, m14 + punpckhwd m7, m9, m7 + pmaddwd m7, m14 + paddd m0, m4 + paddd m2, m6 + paddd m1, m5 + paddd m3, m7 + packuswb m0, m2 + packuswb m1, m3 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + mova [dstq+r10], m0 + add r10, 32 + jl .v_loop + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + add dstq, strideq + ret + +cglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \ + w, h, edge, flt + mov fltq, r6mp + movifnidn hd, hm + mov edged, r7m + mov wd, wm + vbroadcasti128 m6, [wiener_shufB] + vpbroadcastd m12, [fltq+ 2] + vbroadcasti128 m7, [wiener_shufC] + packsswb m12, m12 ; x1 x2 + vpbroadcastw m13, [fltq+ 6] ; x3 + vbroadcasti128 m8, [sgr_shuf+6] + add lpfq, wq + vpbroadcastd m9, [pw_m16380] + vpbroadcastd m10, [pw_2056] + mova m11, [wiener_l_shuf] + vpbroadcastd m14, [fltq+16] ; __ y1 + add dstq, wq + vpbroadcastd m15, [fltq+20] ; y2 y3 + lea t1, [rsp+wq*2+16] + psllw m14, 5 + neg wq + psllw m15, 5 + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t4, t1 + add t1, 384*2 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + mov t3, t1 + add t1, 384*2 + add r10, strideq + mov [rsp], r10 ; below + call .h + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v2 +.main: + mov t0, t4 +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v2 + mov lpfq, [rsp] + call .hv_bottom + add lpfq, strideq + call .hv_bottom +.end: + RET +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v2 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v2 + add t0, 384*6 + call .hv + dec hd + jnz .main +.v2: + call .v + mov t4, t3 + mov t3, t2 + mov t2, t1 + add dstq, strideq +.v1: + call .v + jmp .end +.h: + mov r10, wq 
+ test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movd xm4, [leftq] + vpblendd m4, [lpfq+r10-4], 0xfe + add leftq, 4 + jmp .h_main +.h_extend_left: + vbroadcasti128 m5, [lpfq+r10] ; avoid accessing memory located + mova m4, [lpfq+r10] ; before the start of the buffer + palignr m4, m5, 12 + pshufb m4, m11 + jmp .h_main +.h_top: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m4, [lpfq+r10-4] +.h_main: + movu m5, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -33 + jl .h_have_right + call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right +.h_have_right: + pshufb m0, m4, m6 + pmaddubsw m0, m12 + pshufb m1, m5, m6 + pmaddubsw m1, m12 + pshufb m2, m4, m7 + pmaddubsw m2, m12 + pshufb m3, m5, m7 + pmaddubsw m3, m12 + pshufb m4, m8 + paddw m0, m2 + pmullw m2, m4, m13 + pshufb m5, m8 + paddw m1, m3 + pmullw m3, m5, m13 + psllw m4, 7 + psllw m5, 7 + paddw m4, m9 + paddw m5, m9 + paddw m0, m2 + paddw m1, m3 + paddsw m0, m4 + paddsw m1, m5 + psraw m0, 3 + psraw m1, 3 + paddw m0, m10 + paddw m1, m10 + mova [t1+r10*2+ 0], m0 + mova [t1+r10*2+32], m1 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv: + add lpfq, strideq + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movd xm4, [leftq] + vpblendd m4, [lpfq+r10-4], 0xfe + add leftq, 4 + jmp .hv_main +.hv_extend_left: + movu m4, [lpfq+r10-4] + pshufb m4, m11 + jmp .hv_main +.hv_bottom: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m4, [lpfq+r10-4] +.hv_main: + movu m5, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -33 + jl .hv_have_right + call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right +.hv_have_right: + pshufb m0, m4, m6 + pmaddubsw m0, m12 + pshufb m1, m5, m6 + pmaddubsw m1, m12 + pshufb m2, m4, m7 + pmaddubsw m2, m12 + pshufb m3, m5, m7 + pmaddubsw m3, m12 + pshufb m4, m8 + paddw m0, m2 + pmullw m2, m4, m13 + pshufb m5, m8 + paddw m1, m3 + pmullw m3, m5, m13 + psllw m4, 7 + psllw m5, 7 + paddw m4, m9 + paddw m5, m9 + paddw m0, m2 + paddw m1, m3 + mova m2, [t3+r10*2] + paddw m2, [t1+r10*2] + mova m3, [t2+r10*2] + paddsw m0, m4 + paddsw m1, m5 + psraw m0, 3 + psraw m1, 3 + paddw m0, m10 + paddw m1, m10 + paddw m4, m0, [t4+r10*2] + mova [t0+r10*2], m0 + punpcklwd m0, m2, m3 + pmaddwd m0, m15 + punpckhwd m2, m3 + pmaddwd m2, m15 + punpcklwd m3, m4, m4 + pmaddwd m3, m14 + punpckhwd m4, m4 + pmaddwd m4, m14 + paddd m0, m3 + paddd m4, m2 + mova m2, [t3+r10*2+32] + paddw m2, [t1+r10*2+32] + mova m3, [t2+r10*2+32] + packuswb m0, m4 + paddw m4, m1, [t4+r10*2+32] + mova [t0+r10*2+32], m1 + punpcklwd m1, m2, m3 + pmaddwd m1, m15 + punpckhwd m2, m3 + pmaddwd m2, m15 + punpcklwd m3, m4, m4 + pmaddwd m3, m14 + punpckhwd m4, m4 + pmaddwd m4, m14 + paddd m1, m3 + paddd m2, m4 + packuswb m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + mova [dstq+r10], m0 + add r10, 32 + jl .hv_loop + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t4 + add dstq, strideq + ret +.v: + mov r10, wq + psrld m13, m14, 16 ; y1 __ +.v_loop: + mova m6, [t1+r10*2+ 0] + paddw m2, m6, [t3+r10*2+ 0] + mova m4, [t2+r10*2+ 0] + mova m7, [t1+r10*2+32] + paddw m3, m7, [t3+r10*2+32] + mova m5, [t2+r10*2+32] + paddw m6, [t4+r10*2+ 0] + paddw m7, [t4+r10*2+32] + punpcklwd m0, m2, m4 + pmaddwd m0, m15 + punpckhwd m2, m4 + pmaddwd m2, m15 + punpcklwd m1, m3, m5 + pmaddwd m1, m15 + punpckhwd m3, m5 + pmaddwd m3, m15 + punpcklwd m5, m7, m6 + pmaddwd m4, m5, m14 + punpckhwd m7, m6 + pmaddwd 
m6, m7, m14 + pmaddwd m5, m13 + pmaddwd m7, m13 + paddd m0, m4 + paddd m2, m6 + paddd m1, m5 + paddd m3, m7 + packuswb m0, m2 + packuswb m1, m3 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + mova [dstq+r10], m0 + add r10, 32 + jl .v_loop + ret + +cglobal sgr_filter_5x5_8bpc, 4, 13, 16, 400*24+16, dst, stride, left, lpf, \ + w, h, edge, params +%define base r12-sgr_x_by_x_avx2-256*4 + lea r12, [sgr_x_by_x_avx2+256*4] + mov paramsq, r6mp + mov wd, wm + movifnidn hd, hm + mov edged, r7m + vbroadcasti128 m8, [base+sgr_shuf+0] + vbroadcasti128 m9, [base+sgr_shuf+8] + add lpfq, wq + vbroadcasti128 m10, [base+sgr_shuf+2] + add dstq, wq + vbroadcasti128 m11, [base+sgr_shuf+6] + lea t3, [rsp+wq*4+16+400*12] + vpbroadcastd m12, [paramsq+0] ; s0 + pxor m6, m6 + vpbroadcastw m7, [paramsq+8] ; w0 + lea t1, [rsp+wq*2+20] + vpbroadcastd m13, [base+pd_0xf00800a4] + neg wq + vpbroadcastd m14, [base+pd_34816] ; (1 << 11) + (1 << 15) + psllw m7, 4 + vpbroadcastd m15, [base+pd_m4096] + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t2, t1 + call .top_fixup + add t1, 400*6 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, strideq + mov [rsp], r10 ; below + mov t0, t2 + dec hd + jz .height1 + or edged, 16 + call .h +.main: + add lpfq, strideq + call .hv + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, strideq + test hd, hd + jz .odd_height + call .h + add lpfq, strideq + call .hv + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .h_top + add lpfq, strideq + call .hv_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .hv + call .prep_n + jmp .odd_height_end +.odd_height: + call .hv + call .n0 + call .n1 +.odd_height_end: + call .v + call .n0 + jmp .end2 +.extend_bottom: + call .v + jmp .end +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + lea t2, [t1+400*6] + call .top_fixup + dec hd + jz .no_top_height1 + or edged, 16 + mov t0, t1 + mov t1, t2 + jmp .main +.no_top_height1: + call .v + call .prep_n + jmp .odd_height_end +.extend_right: + movd xm2, r10d + mova m0, [sgr_r_ext] + vpbroadcastb m2, xm2 + psubb m0, m2 + pminub m0, [pb_0to31] + pshufb m5, m0 + ret +.h: ; horizontal boxsum + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .h_main +.h_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .h_main +.h_top: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu xm5, [lpfq+r10-2] +.h_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -18 + jl .h_have_right + call .extend_right +.h_have_right: + pshufb m3, m5, m8 + pmullw m4, m3, m3 + pshufb m2, m5, m9 + paddw m0, m3, m2 + shufps m3, m2, q2121 + paddw m0, m3 + punpcklwd m1, m2, m3 + pmaddwd m1, m1 + punpckhwd m2, m3 + pmaddwd m2, m2 + punpcklwd m3, m4, m6 + paddd m1, m3 + punpckhwd m4, m6 + paddd m2, m4 + pshufb m4, m5, m10 + paddw m0, m4 + pshufb m5, m11 + paddw m0, m5 ; sum + punpcklwd m3, m4, m5 + pmaddwd m3, m3 + punpckhwd m4, m5 + pmaddwd m4, m4 + test edgeb, 16 ; y > 0 + jz .h_loop_end + paddw m0, [t1+r10*2+400*0] + paddd m1, [t1+r10*2+400*2] + paddd m2, [t1+r10*2+400*4] +.h_loop_end: + paddd m1, m3 ; sumsq + paddd m2, m4 + mova [t1+r10*2+400*0], m0 + mova [t1+r10*2+400*2], m1 + mova [t1+r10*2+400*4], m2 + add 
r10, 16 + jl .h_loop + ret +.top_fixup: + lea r10, [wq-2] +.top_fixup_loop: ; the sums of the first row need to be doubled + mova m0, [t1+r10*2+400*0] + mova m1, [t1+r10*2+400*2] + mova m2, [t1+r10*2+400*4] + paddw m0, m0 + paddd m1, m1 + paddd m2, m2 + mova [t2+r10*2+400*0], m0 + mova [t2+r10*2+400*2], m1 + mova [t2+r10*2+400*4], m2 + add r10, 16 + jl .top_fixup_loop + ret +ALIGN function_align +.hv: ; horizontal boxsum + vertical boxsum + ab + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .hv_main +.hv_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .hv_main +.hv_bottom: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu xm5, [lpfq+r10-2] +.hv_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -18 + jl .hv_have_right + call .extend_right +.hv_have_right: + pshufb m1, m5, m8 + pmullw m4, m1, m1 + pshufb m3, m5, m9 + paddw m0, m1, m3 + shufps m1, m3, q2121 + paddw m0, m1 + punpcklwd m2, m3, m1 + pmaddwd m2, m2 + punpckhwd m3, m1 + pmaddwd m3, m3 + punpcklwd m1, m4, m6 + paddd m2, m1 + punpckhwd m4, m6 + paddd m3, m4 + pshufb m1, m5, m10 + paddw m0, m1 + pshufb m5, m11 + paddw m0, m5 ; h sum + punpcklwd m4, m5, m1 + pmaddwd m4, m4 + punpckhwd m5, m1 + pmaddwd m5, m5 + paddw m1, m0, [t1+r10*2+400*0] + paddd m2, m4 ; h sumsq + paddd m3, m5 + paddd m4, m2, [t1+r10*2+400*2] + paddd m5, m3, [t1+r10*2+400*4] + test hd, hd + jz .hv_last_row +.hv_main2: + paddw m1, [t2+r10*2+400*0] ; hv sum + paddd m4, [t2+r10*2+400*2] ; hv sumsq + paddd m5, [t2+r10*2+400*4] + mova [t0+r10*2+400*0], m0 + mova [t0+r10*2+400*2], m2 + mova [t0+r10*2+400*4], m3 + vpbroadcastd m2, [pd_25] + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmulld m4, m2 ; a * 25 + pmulld m5, m2 + pmaddwd m2, m0, m0 ; b * b + pmaddwd m3, m1, m1 + psubd m4, m2 ; p + psubd m5, m3 + pmulld m4, m12 ; p * s + pmulld m5, m12 + pmaddwd m0, m13 ; b * 164 + pmaddwd m1, m13 + paddusw m4, m13 + paddusw m5, m13 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r12+m3*4], m4 ; x + psrad m4, m5, 20 + vpgatherdd m3, [r12+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m14 + pand m0, m15 + pand m1, m15 + por m0, m2 ; a | (b << 12) + por m1, m3 + mova [t3+r10*4+ 8], xm0 ; The neighbor calculations require + vextracti128 [t3+r10*4+40], m0, 1 ; 13 bits for a and 21 bits for b. + mova [t3+r10*4+24], xm1 ; Packing them allows for 12+20, but + vextracti128 [t3+r10*4+56], m1, 1 ; that gets us most of the way.
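For reference, each dword lane of the .hv/.v loops above performs the per-pixel SGR computation spelled out by the instruction comments. A minimal scalar sketch in C (the helper name is made up; the explicit min() stands in for the paddusw/psrad saturation trick, and the (1 << 19) rounding term matches the same step in dav1d's C reference):

    #include <stdint.h>

    /* One 5x5 lane: sumsq/sum are the box sums, s is the sgr strength (m12).
     * Follows the comments: p = a*25 - b*b, z is the rounded quotient
     * clamped to 255, x comes from the dword table (the vpgatherdd), and
     * the result packs x with the rounded x * b * 164 product as
     * "a | (b << 12)" using the pd_m4096 mask. */
    static uint32_t sgr5x5_ab(uint32_t sumsq, uint32_t sum, uint32_t s,
                              const uint32_t *sgr_x_by_x)
    {
        uint32_t a  = 25 * sumsq;                    /* a * 25 */
        uint32_t bb = sum * sum;                     /* b * b */
        uint32_t p  = a > bb ? a - bb : 0;           /* p */
        uint32_t z  = (p * s + (1 << 19)) >> 20;
        uint32_t x  = sgr_x_by_x[z < 255 ? z : 255]; /* min(z, 255) */
        uint32_t b  = (x * sum * 164 + (1 << 11) + (1 << 15)) & ~0xfffu;
        return x | b;                                /* a | (b << 12) */
    }

Packing both fields into one dword is what lets the neighbor passes below split a and b back out with a single pandn/psrld pair instead of keeping two separate ring buffers.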
+ add r10, 16 + jl .hv_loop + mov t2, t1 + mov t1, t0 + mov t0, t2 + ret +.hv_last_row: ; esoteric edge case for odd heights + mova [t1+r10*2+400*0], m1 + paddw m1, m0 + mova [t1+r10*2+400*2], m4 + paddd m4, m2 + mova [t1+r10*2+400*4], m5 + paddd m5, m3 + jmp .hv_main2 +.v: ; vertical boxsum + ab + lea r10, [wq-2] +.v_loop: + mova m0, [t1+r10*2+400*0] + mova m2, [t1+r10*2+400*2] + mova m3, [t1+r10*2+400*4] + paddw m1, m0, [t2+r10*2+400*0] + paddd m4, m2, [t2+r10*2+400*2] + paddd m5, m3, [t2+r10*2+400*4] + paddw m0, m0 + paddd m2, m2 + paddd m3, m3 + paddw m1, m0 ; hv sum + paddd m4, m2 ; hv sumsq + paddd m5, m3 + vpbroadcastd m2, [pd_25] + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pmulld m4, m2 ; a * 25 + pmulld m5, m2 + pmaddwd m2, m0, m0 ; b * b + pmaddwd m3, m1, m1 + psubd m4, m2 ; p + psubd m5, m3 + pmulld m4, m12 ; p * s + pmulld m5, m12 + pmaddwd m0, m13 ; b * 164 + pmaddwd m1, m13 + paddusw m4, m13 + paddusw m5, m13 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r12+m3*4], m4 ; x + psrad m4, m5, 20 + vpgatherdd m3, [r12+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m14 + pand m0, m15 + pand m1, m15 + por m0, m2 ; a | (b << 12) + por m1, m3 + mova [t3+r10*4+ 8], xm0 + vextracti128 [t3+r10*4+40], m0, 1 + mova [t3+r10*4+24], xm1 + vextracti128 [t3+r10*4+56], m1, 1 + add r10, 16 + jl .v_loop + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + movu m0, [t3+r10*4+ 4] + movu m1, [t3+r10*4+36] + paddd m2, m0, [t3+r10*4+ 0] + paddd m3, m1, [t3+r10*4+32] + paddd m2, [t3+r10*4+ 8] + paddd m3, [t3+r10*4+40] + paddd m0, m2 + pslld m2, 2 + paddd m1, m3 + pslld m3, 2 + paddd m2, m0 ; ab 565 + paddd m3, m1 + pandn m0, m15, m2 ; a + psrld m2, 12 ; b + pandn m1, m15, m3 + psrld m3, 12 + mova [t3+r10*4+400*4+ 0], m0 + mova [t3+r10*4+400*8+ 0], m2 + mova [t3+r10*4+400*4+32], m1 + mova [t3+r10*4+400*8+32], m3 + add r10, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + movu m0, [t3+r10*4+ 4] + movu m1, [t3+r10*4+36] + paddd m2, m0, [t3+r10*4+ 0] + paddd m3, m1, [t3+r10*4+32] + paddd m2, [t3+r10*4+ 8] + paddd m3, [t3+r10*4+40] + paddd m0, m2 + pslld m2, 2 + paddd m1, m3 + pslld m3, 2 + paddd m2, m0 + paddd m3, m1 + pandn m0, m15, m2 + psrld m2, 12 + pandn m1, m15, m3 + psrld m3, 12 + paddd m4, m0, [t3+r10*4+400*4+ 0] ; a + paddd m5, m1, [t3+r10*4+400*4+32] + mova [t3+r10*4+400*4+ 0], m0 + mova [t3+r10*4+400*4+32], m1 + paddd m0, m2, [t3+r10*4+400*8+ 0] ; b + paddd m1, m3, [t3+r10*4+400*8+32] + mova [t3+r10*4+400*8+ 0], m2 + mova [t3+r10*4+400*8+32], m3 + pmovzxbd m2, [dstq+r10+0] + pmovzxbd m3, [dstq+r10+8] + pmaddwd m4, m2 ; a * src + pmaddwd m5, m3 + packssdw m2, m3 + psubd m0, m4 ; b - a * src + (1 << 8) + psubd m1, m5 + psrad m0, 9 + psrad m1, 9 + packssdw m0, m1 + pmulhrsw m0, m7 + paddw m0, m2 + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + pshufd xm0, xm0, q3120 + mova [dstq+r10], xm0 + add r10, 16 + jl .n0_loop + add dstq, strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + pmovzxbd m2, [dstq+r10+0] + pmovzxbd m3, [dstq+r10+8] + pmaddwd m4, m2, [t3+r10*4+400*4+ 0] ; a * src + pmaddwd m5, m3, [t3+r10*4+400*4+32] + mova m0, [t3+r10*4+400*8+ 0] ; b + mova m1, [t3+r10*4+400*8+32] + packssdw m2, m3 + psubd m0, m4 ; b - a * src + (1 << 7) + psubd m1, m5 + psrad m0, 8 + psrad m1, 8 + packssdw m0, m1 + pmulhrsw m0, m7 + paddw m0, m2 + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + pshufd xm0, xm0, 
q3120 + mova [dstq+r10], xm0 + add r10, 16 + jl .n1_loop + add dstq, strideq + ret + +cglobal sgr_filter_3x3_8bpc, 4, 15, 15, -400*28-16, dst, stride, left, lpf, \ + w, h, edge, params +%define base r14-sgr_x_by_x_avx2-256*4 + mov paramsq, r6mp + mov wd, wm + movifnidn hd, hm + mov edged, r7m + lea r14, [sgr_x_by_x_avx2+256*4] + vbroadcasti128 m8, [base+sgr_shuf+2] + add lpfq, wq + vbroadcasti128 m9, [base+sgr_shuf+4] + add dstq, wq + vbroadcasti128 m10, [base+sgr_shuf+6] + lea t3, [rsp+wq*4+16+400*12] + vpbroadcastd m11, [paramsq+ 4] ; s1 + pxor m6, m6 + vpbroadcastw m7, [paramsq+10] ; w1 + lea t1, [rsp+wq*2+20] + vpbroadcastd m12, [base+pd_0xf00801c7] + neg wq + vpbroadcastd m13, [base+pd_34816] ; (1 << 11) + (1 << 15) + psllw m7, 4 + vpbroadcastd m14, [base+pd_m4096] + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t2, t1 + add t1, 400*6 + call .h_top + lea t4, [lpfq+strideq*4] + mov lpfq, dstq + add t4, strideq + mov [rsp], t4 ; below + mov t0, t2 + call .hv +.main: + mov t5, t3 + add t3, 400*4 + dec hd + jz .height1 + add lpfq, strideq + call .hv + call .prep_n + dec hd + jz .extend_bottom +.main_loop: + add lpfq, strideq + call .hv + call .n + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .hv_bottom + call .n + add lpfq, strideq + call .hv_bottom +.end: + call .n + RET +.height1: + call .v + call .prep_n + mov t2, t1 + call .v + jmp .end +.extend_bottom: + call .v + call .n + mov t2, t1 + call .v + jmp .end +.no_top: + lea t4, [lpfq+strideq*4] + mov lpfq, dstq + lea t4, [t4+strideq*2] + mov [rsp], t4 + call .h + lea t0, [t1+400*6] + mov t2, t1 + call .v + jmp .main +.h: ; horizontal boxsum + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .h_main +.h_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .h_main +.h_top: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu xm5, [lpfq+r10-2] +.h_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -17 + jl .h_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right +.h_have_right: + pshufb m0, m5, m8 + pmullw m2, m0, m0 + pshufb m4, m5, m9 + paddw m0, m4 + pshufb m5, m10 + paddw m0, m5 ; sum + punpcklwd m3, m4, m5 + pmaddwd m3, m3 + punpckhwd m4, m5 + pmaddwd m4, m4 + punpcklwd m1, m2, m6 + punpckhwd m2, m6 + mova [t1+r10*2+400*0], m0 + paddd m1, m3 ; sumsq + paddd m2, m4 + mova [t1+r10*2+400*2], m1 + mova [t1+r10*2+400*4], m2 + add r10, 16 + jl .h_loop + ret +ALIGN function_align +.hv: ; horizontal boxsum + vertical boxsum + ab + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .hv_main +.hv_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .hv_main +.hv_bottom: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu xm5, [lpfq+r10-2] +.hv_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -17 + jl .hv_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right +.hv_have_right: + pshufb m0, m5, m8 + pmullw m3, m0, m0 + pshufb m1, m5, m9 + paddw m0, m1 + pshufb m5, m10 + paddw m0, m5 ; h sum + punpcklwd m4, m5, m1 + pmaddwd m4, m4 + punpckhwd m5, m1 + pmaddwd m5, m5 + 
paddw m1, m0, [t2+r10*2+400*0] + paddw m1, [t1+r10*2+400*0] ; hv sum + punpcklwd m2, m3, m6 + punpckhwd m3, m6 + paddd m4, m2 ; h sumsq + paddd m5, m3 + paddd m2, m4, [t2+r10*2+400*2] + paddd m3, m5, [t2+r10*2+400*4] + paddd m2, [t1+r10*2+400*2] ; hv sumsq + paddd m3, [t1+r10*2+400*4] + mova [t0+r10*2+400*0], m0 + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + mova [t0+r10*2+400*2], m4 + pslld m4, m2, 3 + mova [t0+r10*2+400*4], m5 + pslld m5, m3, 3 + paddd m4, m2 ; a * 9 + pmaddwd m2, m0, m0 ; b * b + paddd m5, m3 + pmaddwd m3, m1, m1 + psubd m4, m2 ; p + psubd m5, m3 + pmulld m4, m11 ; p * s + pmulld m5, m11 + pmaddwd m0, m12 ; b * 455 + pmaddwd m1, m12 + paddusw m4, m12 + paddusw m5, m12 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r14+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r14+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m13 + pand m0, m14 + pand m1, m14 + por m0, m2 ; a | (b << 12) + por m1, m3 + mova [t3+r10*4+ 8], xm0 + vextracti128 [t3+r10*4+40], m0, 1 + mova [t3+r10*4+24], xm1 + vextracti128 [t3+r10*4+56], m1, 1 + add r10, 16 + jl .hv_loop + mov t2, t1 + mov t1, t0 + mov t0, t2 + ret +.v: ; vertical boxsum + ab + lea r10, [wq-2] +.v_loop: + mova m1, [t1+r10*2+400*0] + paddw m1, m1 + paddw m1, [t2+r10*2+400*0] ; hv sum + mova m2, [t1+r10*2+400*2] + mova m3, [t1+r10*2+400*4] + paddd m2, m2 + paddd m3, m3 + paddd m2, [t2+r10*2+400*2] ; hv sumsq + paddd m3, [t2+r10*2+400*4] + punpcklwd m0, m1, m6 ; b + punpckhwd m1, m6 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; a * 9 + pmaddwd m2, m0, m0 ; b * b + paddd m5, m3 + pmaddwd m3, m1, m1 + psubd m4, m2 ; p + psubd m5, m3 + pmulld m4, m11 ; p * s + pmulld m5, m11 + pmaddwd m0, m12 ; b * 455 + pmaddwd m1, m12 + paddusw m4, m12 + paddusw m5, m12 + psrad m3, m4, 20 ; min(z, 255) - 256 + vpgatherdd m2, [r14+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r14+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m13 + pand m0, m14 + pand m1, m14 + por m0, m2 ; a | (b << 12) + por m1, m3 + mova [t3+r10*4+ 8], xm0 + vextracti128 [t3+r10*4+40], m0, 1 + mova [t3+r10*4+24], xm1 + vextracti128 [t3+r10*4+56], m1, 1 + add r10, 16 + jl .v_loop + ret +.prep_n: ; initial neighbor setup + mov r10, wq + mov t4, t3 + add t3, 400*4 +.prep_n_loop: + mova m2, [t5+r10*4+0] + mova m3, [t4+r10*4+0] + paddd m2, [t5+r10*4+8] + paddd m3, [t4+r10*4+8] + paddd m0, m2, [t5+r10*4+4] + paddd m1, m3, [t4+r10*4+4] + pslld m0, 2 + paddd m1, m1 ; ab[ 0] 222 + psubd m0, m2 ; ab[-1] 343 + mova [t3+r10*4+400*4], m1 + paddd m1, m1 + mova [t5+r10*4], m0 + psubd m1, m3 ; ab[ 0] 343 + mova [t4+r10*4], m1 + add r10, 8 + jl .prep_n_loop + ret +; a+b are packed together in a single dword, but we can't do the +; full neighbor calculations before splitting them since we don't +; have sufficient precision. The solution is to do the calculations +; in two equal halves and split a and b before doing the final sum. 
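A rough C model of that splitting step (a sketch with made-up names; the half definitions and the (1 << 8) bias are taken from the comments in .prep_n and .n, and an arithmetic right shift is assumed, matching psrad):

    #include <stdint.h>

    /* Each ring-buffer dword packs a into bits 0..11 and b into bits
     * 12..31 ("a | (b << 12)").  half0/half1 are the two weighted
     * neighbor halves from the comments (ab[ 0] 222 + ab[-1] 343 and
     * ab[ 0] 222 + ab[+1] 343); splitting each half before the final
     * sum keeps both fields inside their 12- and 20-bit budgets. */
    static int32_t sgr3x3_out(uint32_t half0, uint32_t half1, uint8_t src)
    {
        uint32_t a = (half0 & 0xfff) + (half1 & 0xfff); /* pandn, pd_m4096 */
        uint32_t b = (half0 >> 12) + (half1 >> 12);     /* psrld 12; this sum
                                                         * already carries the
                                                         * (1 << 8) bias noted
                                                         * in the .n comments */
        return (int32_t)(b - a * src) >> 9; /* b - a * src + (1 << 8), psrad 9 */
    }

The .n loop below then scales this value by the sgr weight (pmulhrsw m0, m7) and adds it back to the source pixels before packing down to bytes.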
+ALIGN function_align +.n: ; neighbor + output + mov r10, wq +.n_loop: + mova m4, [t3+r10*4+ 0] + paddd m4, [t3+r10*4+ 8] + paddd m5, m4, [t3+r10*4+ 4] + paddd m5, m5 ; ab[+1] 222 + mova m2, [t3+r10*4+400*4+ 0] + paddd m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343 + mova m3, [t3+r10*4+400*4+32] + paddd m1, m3, [t5+r10*4+32] + mova [t3+r10*4+400*4+ 0], m5 + paddd m5, m5 + psubd m5, m4 ; ab[+1] 343 + mova [t5+r10*4+ 0], m5 + paddd m2, m5 ; ab[ 0] 222 + ab[+1] 343 + mova m4, [t3+r10*4+32] + paddd m4, [t3+r10*4+40] + paddd m5, m4, [t3+r10*4+36] + paddd m5, m5 + mova [t3+r10*4+400*4+32], m5 + paddd m5, m5 + psubd m5, m4 + mova [t5+r10*4+32], m5 + pandn m4, m14, m0 + psrld m0, 12 + paddd m3, m5 + pandn m5, m14, m2 + psrld m2, 12 + paddd m4, m5 ; a + pandn m5, m14, m1 + psrld m1, 12 + paddd m0, m2 ; b + (1 << 8) + pandn m2, m14, m3 + psrld m3, 12 + paddd m5, m2 + pmovzxbd m2, [dstq+r10+0] + paddd m1, m3 + pmovzxbd m3, [dstq+r10+8] + pmaddwd m4, m2 ; a * src + pmaddwd m5, m3 + packssdw m2, m3 + psubd m0, m4 ; b - a * src + (1 << 8) + psubd m1, m5 + psrad m0, 9 + psrad m1, 9 + packssdw m0, m1 + pmulhrsw m0, m7 + paddw m0, m2 + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + pshufd xm0, xm0, q3120 + mova [dstq+r10], xm0 + add r10, 16 + jl .n_loop + mov r10, t5 + mov t5, t4 + mov t4, r10 + add dstq, strideq + ret + +cglobal sgr_filter_mix_8bpc, 4, 13, 16, 400*56+8, dst, stride, left, lpf, \ + w, h, edge, params +%define base r12-sgr_x_by_x_avx2-256*4 + lea r12, [sgr_x_by_x_avx2+256*4] + mov paramsq, r6mp + mov wd, wm + movifnidn hd, hm + mov edged, r7m + vbroadcasti128 m9, [base+sgr_shuf+0] + vbroadcasti128 m10, [base+sgr_shuf+8] + add lpfq, wq + vbroadcasti128 m11, [base+sgr_shuf+2] + vbroadcasti128 m12, [base+sgr_shuf+6] + add dstq, wq + vpbroadcastd m15, [paramsq+8] ; w0 w1 + lea t3, [rsp+wq*4+400*24+8] + vpbroadcastd m13, [paramsq+0] ; s0 + pxor m7, m7 + vpbroadcastd m14, [paramsq+4] ; s1 + lea t1, [rsp+wq*2+12] + neg wq + psllw m15, 2 ; to reuse existing pd_m4096 register for rounding + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t2, t1 + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).top_fixup + add t1, 400*12 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, strideq + mov [rsp], r10 ; below + call .hv0 +.main: + dec hd + jz .height1 + add lpfq, strideq + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, strideq + call .hv0 + test hd, hd + jz .odd_height + add lpfq, strideq + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .hv0_bottom + add lpfq, strideq + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + lea t2, [t1+400*12] + lea r10, [wq-2] +.top_fixup_loop: + mova m0, [t1+r10*2+400* 0] + mova m1, [t1+r10*2+400* 2] + mova m2, [t1+r10*2+400* 4] + paddw m0, m0 + mova m3, [t1+r10*2+400* 6] + paddd m1, m1 + mova m4, [t1+r10*2+400* 8] + paddd m2, m2 + mova m5, [t1+r10*2+400*10] + mova [t2+r10*2+400* 0], m0 + mova [t2+r10*2+400* 2], m1 + mova [t2+r10*2+400* 4], m2 + mova [t2+r10*2+400* 6], m3 + mova [t2+r10*2+400* 8], m4 + mova [t2+r10*2+400*10], m5 + add r10, 16 + jl 
.top_fixup_loop + call .v0 + jmp .main +.h: ; horizontal boxsums + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .h_main +.h_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .h_main +.h_top: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu xm5, [lpfq+r10-2] +.h_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -18 + jl .h_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right +.h_have_right: + pshufb m6, m5, m9 + pshufb m4, m5, m10 + paddw m8, m6, m4 + shufps m0, m6, m4, q2121 + pmullw m3, m0, m0 + pshufb m2, m5, m11 + paddw m0, m2 + pshufb m5, m12 + paddw m0, m5 ; sum3 + punpcklwd m1, m2, m5 + pmaddwd m1, m1 + punpckhwd m2, m5 + pmaddwd m2, m2 + punpcklwd m5, m6, m4 + pmaddwd m5, m5 + punpckhwd m6, m4 + pmaddwd m6, m6 + punpcklwd m4, m3, m7 + paddd m1, m4 ; sumsq3 + punpckhwd m3, m7 + paddd m2, m3 + mova [t1+r10*2+400* 6], m0 + mova [t1+r10*2+400* 8], m1 + mova [t1+r10*2+400*10], m2 + paddw m8, m0 ; sum5 + paddd m5, m1 ; sumsq5 + paddd m6, m2 + mova [t1+r10*2+400* 0], m8 + mova [t1+r10*2+400* 2], m5 + mova [t1+r10*2+400* 4], m6 + add r10, 16 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows) + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .hv0_main +.hv0_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .hv0_main +.hv0_bottom: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left +.hv0_loop: + movu xm5, [lpfq+r10-2] +.hv0_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp r10d, -18 + jl .hv0_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right +.hv0_have_right: + pshufb m6, m5, m9 + pshufb m4, m5, m10 + paddw m8, m6, m4 + shufps m1, m6, m4, q2121 + pmullw m0, m1, m1 + pshufb m3, m5, m11 + paddw m1, m3 + pshufb m5, m12 + paddw m1, m5 ; sum3 + punpcklwd m2, m3, m5 + pmaddwd m2, m2 + punpckhwd m3, m5 + pmaddwd m3, m3 + punpcklwd m5, m6, m4 + pmaddwd m5, m5 + punpckhwd m6, m4 + pmaddwd m6, m6 + punpcklwd m4, m0, m7 + paddd m2, m4 ; sumsq3 + punpckhwd m0, m7 + paddd m3, m0 + paddw m8, m1 ; sum5 + paddd m5, m2 ; sumsq5 + paddd m6, m3 + mova [t3+r10*4+400*8+ 8], m8 ; we need a clean copy of the last row + mova [t3+r10*4+400*0+ 8], m5 ; in case height is odd + mova [t3+r10*4+400*0+40], m6 + paddw m8, [t1+r10*2+400* 0] + paddd m5, [t1+r10*2+400* 2] + paddd m6, [t1+r10*2+400* 4] + mova [t1+r10*2+400* 0], m8 + mova [t1+r10*2+400* 2], m5 + mova [t1+r10*2+400* 4], m6 + paddw m0, m1, [t1+r10*2+400* 6] + paddd m4, m2, [t1+r10*2+400* 8] + paddd m5, m3, [t1+r10*2+400*10] + mova [t1+r10*2+400* 6], m1 + mova [t1+r10*2+400* 8], m2 + mova [t1+r10*2+400*10], m3 + paddw m1, m0, [t2+r10*2+400* 6] + paddd m2, m4, [t2+r10*2+400* 8] + paddd m3, m5, [t2+r10*2+400*10] + mova [t2+r10*2+400* 6], m0 + mova [t2+r10*2+400* 8], m4 + mova [t2+r10*2+400*10], m5 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; a3 * 9 + pmaddwd m2, m0, m0 ; b3 * b3 + paddd m5, m3 + pmaddwd m3, m1, m1 + psubd m4, m2 ; p3 + vpbroadcastd m2, [base+pd_0xf00801c7] + psubd m5, m3 + pmulld m4, m14 ; p3 * s1 + pmulld m5, m14 + pmaddwd m0, m2 ; b3 * 455 + pmaddwd
m1, m2 + paddusw m4, m2 + paddusw m5, m2 + psrad m3, m4, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r12+m3*4], m4 + psrad m4, m5, 20 + vpgatherdd m3, [r12+m4*4], m5 + vpbroadcastd m4, [base+pd_34816] + pmulld m0, m2 + vpbroadcastd m5, [base+pd_m4096] + pmulld m1, m3 + paddd m0, m4 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m4 + pand m0, m5 + pand m1, m5 + por m0, m2 ; a3 | (b3 << 12) + por m1, m3 + mova [t3+r10*4+400*4+ 8], xm0 + vextracti128 [t3+r10*4+400*4+40], m0, 1 + mova [t3+r10*4+400*4+24], xm1 + vextracti128 [t3+r10*4+400*4+56], m1, 1 + add r10, 16 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + vpbroadcastd xm0, [leftq] + mova xm5, [lpfq+wq] + palignr xm5, xm0, 12 + add leftq, 4 + jmp .hv1_main +.hv1_extend_left: + mova xm5, [lpfq+wq] + pshufb xm5, [base+sgr_l_shuf] + jmp .hv1_main +.hv1_bottom: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left +.hv1_loop: + movu xm5, [lpfq+r10-2] +.hv1_main: + vinserti128 m5, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp r10d, -18 + jl .hv1_have_right + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right +.hv1_have_right: + pshufb m6, m5, m9 + pshufb m3, m5, m10 + paddw m8, m6, m3 + shufps m2, m6, m3, q2121 + pmullw m1, m2, m2 + pshufb m0, m5, m11 + paddw m2, m0 + pshufb m5, m12 + paddw m2, m5 ; sum3 + punpcklwd m4, m5, m0 + pmaddwd m4, m4 + punpckhwd m5, m0 + pmaddwd m5, m5 + punpcklwd m0, m6, m3 + pmaddwd m0, m0 + punpckhwd m6, m3 + pmaddwd m6, m6 + punpcklwd m3, m1, m7 + paddd m4, m3 ; sumsq3 + punpckhwd m1, m7 + paddd m5, m1 + paddw m1, m2, [t2+r10*2+400* 6] + mova [t2+r10*2+400* 6], m2 + paddw m8, m2 ; sum5 + paddd m2, m4, [t2+r10*2+400* 8] + paddd m3, m5, [t2+r10*2+400*10] + mova [t2+r10*2+400* 8], m4 + mova [t2+r10*2+400*10], m5 + paddd m4, m0 ; sumsq5 + paddd m5, m6 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pslld m6, m2, 3 + pslld m7, m3, 3 + paddd m6, m2 ; a3 * 9 + pmaddwd m2, m0, m0 ; b3 * b3 + paddd m7, m3 + pmaddwd m3, m1, m1 + psubd m6, m2 ; p3 + vpbroadcastd m2, [base+pd_0xf00801c7] + psubd m7, m3 + pmulld m6, m14 ; p3 * s1 + pmulld m7, m14 + pmaddwd m0, m2 ; b3 * 455 + pmaddwd m1, m2 + paddusw m6, m2 + paddusw m7, m2 + psrad m3, m6, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r12+m3*4], m6 + psrad m6, m7, 20 + vpgatherdd m3, [r12+m6*4], m7 + vpbroadcastd m6, [base+pd_34816] ; x3 + pmulld m0, m2 + vpbroadcastd m7, [base+pd_m4096] + pmulld m1, m3 + paddd m0, m6 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m6 + pand m0, m7 + pand m7, m1 + por m0, m2 ; a3 | (b3 << 12) + por m7, m3 + paddw m1, m8, [t2+r10*2+400*0] + paddd m2, m4, [t2+r10*2+400*2] + paddd m3, m5, [t2+r10*2+400*4] + paddw m1, [t1+r10*2+400*0] + paddd m2, [t1+r10*2+400*2] + paddd m3, [t1+r10*2+400*4] + mova [t2+r10*2+400*0], m8 + mova [t2+r10*2+400*2], m4 + mova [t2+r10*2+400*4], m5 + mova [t3+r10*4+400*8+ 8], xm0 + vextracti128 [t3+r10*4+400*8+40], m0, 1 + mova [t3+r10*4+400*8+24], xm7 + vextracti128 [t3+r10*4+400*8+56], m7, 1 + vpbroadcastd m4, [base+pd_25] + pxor m7, m7 + punpcklwd m0, m1, m7 ; b5 + punpckhwd m1, m7 + pmulld m2, m4 ; a5 * 25 + pmulld m3, m4 + pmaddwd m4, m0, m0 ; b5 * b5 + pmaddwd m5, m1, m1 + psubd m2, m4 ; p5 + vpbroadcastd m4, [base+pd_0xf00800a4] + psubd m3, m5 + pmulld m2, m13 ; p5 * s0 + pmulld m3, m13 + pmaddwd m0, m4 ; b5 * 164 + pmaddwd m1, m4 + paddusw m2, m4 + paddusw m3, m4 + psrad m5, m2, 20 ; min(z5, 255) - 256 + vpgatherdd 
m4, [r12+m5*4], m2 ; x5 + psrad m2, m3, 20 + vpgatherdd m5, [r12+m2*4], m3 + pmulld m0, m4 + pmulld m1, m5 + paddd m0, m6 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m1, m6 + vpbroadcastd m6, [base+pd_m4096] + pand m0, m6 + pand m1, m6 + por m0, m4 ; a5 | (b5 << 12) + por m1, m5 + mova [t3+r10*4+400*0+ 8], xm0 + vextracti128 [t3+r10*4+400*0+40], m0, 1 + mova [t3+r10*4+400*0+24], xm1 + vextracti128 [t3+r10*4+400*0+56], m1, 1 + add r10, 16 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab3 (even rows) + lea r10, [wq-2] + vpbroadcastd m6, [base+pd_34816] + vpbroadcastd m8, [base+pd_m4096] +.v0_loop: + mova m0, [t1+r10*2+400* 6] + mova m4, [t1+r10*2+400* 8] + mova m5, [t1+r10*2+400*10] + paddw m0, m0 + paddd m4, m4 + paddd m5, m5 + paddw m1, m0, [t2+r10*2+400* 6] + paddd m2, m4, [t2+r10*2+400* 8] + paddd m3, m5, [t2+r10*2+400*10] + mova [t2+r10*2+400* 6], m0 + mova [t2+r10*2+400* 8], m4 + mova [t2+r10*2+400*10], m5 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; a3 * 9 + pmaddwd m2, m0, m0 ; b3 * b3 + paddd m5, m3 + pmaddwd m3, m1, m1 + psubd m4, m2 ; p3 + vpbroadcastd m2, [base+pd_0xf00801c7] + psubd m5, m3 + pmulld m4, m14 ; p3 * s1 + pmulld m5, m14 + pmaddwd m0, m2 ; b3 * 455 + pmaddwd m1, m2 + paddusw m4, m2 + paddusw m5, m2 + psrad m3, m4, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r12+m3*4], m4 ; x3 + psrad m4, m5, 20 + vpgatherdd m3, [r12+m4*4], m5 + pmulld m0, m2 + pmulld m1, m3 + paddd m0, m6 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m6 + pand m0, m8 + pand m1, m8 + por m0, m2 ; a3 | (b3 << 12) + por m1, m3 + mova m2, [t1+r10*2+400*0] + mova m3, [t1+r10*2+400*2] + mova m4, [t1+r10*2+400*4] + mova [t3+r10*4+400*8+ 8], m2 + mova [t3+r10*4+400*0+ 8], m3 + mova [t3+r10*4+400*0+40], m4 + paddw m2, m2 ; cc5 + paddd m3, m3 + paddd m4, m4 + mova [t1+r10*2+400*0], m2 + mova [t1+r10*2+400*2], m3 + mova [t1+r10*2+400*4], m4 + mova [t3+r10*4+400*4+ 8], xm0 + vextracti128 [t3+r10*4+400*4+40], m0, 1 + mova [t3+r10*4+400*4+24], xm1 + vextracti128 [t3+r10*4+400*4+56], m1, 1 + add r10, 16 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) + lea r10, [wq-2] +.v1_loop: + mova m4, [t1+r10*2+400* 6] + mova m5, [t1+r10*2+400* 8] + mova m6, [t1+r10*2+400*10] + paddw m1, m4, [t2+r10*2+400* 6] + paddd m2, m5, [t2+r10*2+400* 8] + paddd m3, m6, [t2+r10*2+400*10] + mova [t2+r10*2+400* 6], m4 + mova [t2+r10*2+400* 8], m5 + mova [t2+r10*2+400*10], m6 + punpcklwd m0, m1, m7 ; b3 + punpckhwd m1, m7 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; a3 * 9 + pmaddwd m2, m0, m0 ; b3 * b3 + paddd m5, m3 + pmaddwd m3, m1, m1 + psubd m4, m2 ; p3 + vpbroadcastd m2, [base+pd_0xf00801c7] + psubd m5, m3 + pmulld m4, m14 ; p3 * s1 + pmulld m5, m14 + pmaddwd m0, m2 ; b3 * 455 + pmaddwd m1, m2 + paddusw m4, m2 + paddusw m5, m2 + psrad m3, m4, 20 ; min(z3, 255) - 256 + vpgatherdd m2, [r12+m3*4], m4 ; x3 + psrad m4, m5, 20 + vpgatherdd m3, [r12+m4*4], m5 + vpbroadcastd m4, [base+pd_34816] + pmulld m0, m2 + vpbroadcastd m8, [base+pd_m4096] + pmulld m1, m3 + paddd m0, m4 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m4 + pand m0, m8 + pand m8, m1 + por m0, m2 ; a3 | (b3 << 12) + por m8, m3 + mova m4, [t3+r10*4+400*8+ 8] + mova m5, [t3+r10*4+400*0+ 8] + mova m6, [t3+r10*4+400*0+40] + paddw m1, m4, [t2+r10*2+400*0] + paddd m2, m5, [t2+r10*2+400*2] + paddd m3, m6, [t2+r10*2+400*4] + paddw m1, [t1+r10*2+400*0] + paddd m2, [t1+r10*2+400*2] + paddd m3, [t1+r10*2+400*4] + mova [t2+r10*2+400*0], m4 + mova 
[t2+r10*2+400*2], m5 + mova [t2+r10*2+400*4], m6 + vpbroadcastd m4, [base+pd_25] + mova [t3+r10*4+400*8+ 8], xm0 + vextracti128 [t3+r10*4+400*8+40], m0, 1 + mova [t3+r10*4+400*8+24], xm8 + vextracti128 [t3+r10*4+400*8+56], m8, 1 + punpcklwd m0, m1, m7 ; b5 + punpckhwd m1, m7 + pmulld m2, m4 ; a5 * 25 + pmulld m3, m4 + pmaddwd m4, m0, m0 ; b5 * b5 + pmaddwd m5, m1, m1 + psubd m2, m4 ; p5 + vpbroadcastd m4, [base+pd_0xf00800a4] + psubd m3, m5 + pmulld m2, m13 ; p5 * s0 + pmulld m3, m13 + pmaddwd m0, m4 ; b5 * 164 + pmaddwd m1, m4 + paddusw m2, m4 + paddusw m3, m4 + psrad m5, m2, 20 ; min(z5, 255) - 256 + vpgatherdd m4, [r12+m5*4], m2 ; x5 + psrad m2, m3, 20 + vpgatherdd m5, [r12+m2*4], m3 + pmulld m0, m4 + vpbroadcastd m6, [base+pd_34816] + pmulld m1, m5 + paddd m0, m6 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m1, m6 + vpbroadcastd m6, [base+pd_m4096] + pand m0, m6 + pand m1, m6 + por m0, m4 ; a5 | (b5 << 12) + por m1, m5 + mova [t3+r10*4+400*0+ 8], xm0 + vextracti128 [t3+r10*4+400*0+40], m0, 1 + mova [t3+r10*4+400*0+24], xm1 + vextracti128 [t3+r10*4+400*0+56], m1, 1 + add r10, 16 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + movu m0, [t3+r10*4+400*0+4] + paddd m1, m0, [t3+r10*4+400*0+0] + mova m4, [t3+r10*4+400*4+0] + paddd m1, [t3+r10*4+400*0+8] + mova m5, [t3+r10*4+400*8+0] + paddd m4, [t3+r10*4+400*4+8] + paddd m5, [t3+r10*4+400*8+8] + paddd m2, m4, [t3+r10*4+400*4+4] + paddd m3, m5, [t3+r10*4+400*8+4] + paddd m0, m1 + pslld m1, 2 + pslld m2, 2 + paddd m1, m0 ; ab5 565 + paddd m3, m3 ; ab3[ 0] 222 + psubd m2, m4 ; ab3[-1] 343 + mova [t3+r10*4+400*20], m3 + pandn m0, m6, m1 ; a5 565 + mova [t3+r10*4+400*24], m2 + psrld m1, 12 ; b5 565 + mova [t3+r10*4+400*12], m0 + paddd m3, m3 + mova [t3+r10*4+400*16], m1 + psubd m3, m5 ; ab3[ 0] 343 + mova [t3+r10*4+400*28], m3 + add r10, 8 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + movu m0, [t3+r10*4+4] + paddd m4, m0, [t3+r10*4+0] + paddd m4, [t3+r10*4+8] + paddd m0, m4 + pslld m4, 2 + paddd m4, m0 + pandn m0, m6, m4 + psrld m4, 12 + paddd m2, m0, [t3+r10*4+400*12] ; a5 + mova [t3+r10*4+400*12], m0 + paddd m0, m4, [t3+r10*4+400*16] ; b5 + (1 << 8) + mova [t3+r10*4+400*16], m4 + mova m3, [t3+r10*4+400*4+0] + paddd m3, [t3+r10*4+400*4+8] + paddd m5, m3, [t3+r10*4+400*4+4] + paddd m5, m5 ; ab3[ 1] 222 + mova m4, [t3+r10*4+400*20] + paddd m1, m4, [t3+r10*4+400*24] ; ab3[ 0] 222 + ab3[-1] 343 + mova [t3+r10*4+400*20], m5 + paddd m5, m5 + psubd m5, m3 ; ab3[ 1] 343 + mova [t3+r10*4+400*24], m5 + paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343 + pandn m3, m6, m1 + psrld m1, 12 + pandn m5, m6, m4 + psrld m4, 12 + paddd m3, m5 ; a3 + paddd m1, m4 ; b3 + (1 << 8) + pmovzxbd m4, [dstq+r10] + pmaddwd m2, m4 ; a5 * src + pmaddwd m3, m4 ; a3 * src + psubd m0, m2 ; b5 - a5 * src + (1 << 8) + psubd m1, m3 ; b3 - a3 * src + (1 << 8) + psrld m0, 9 + pslld m1, 7 + pblendw m0, m1, 0xaa + pmaddwd m0, m15 + psubd m0, m6 + psrad m0, 13 + paddd m0, m4 + vextracti128 xm1, m0, 1 + packssdw xm0, xm1 + packuswb xm0, xm0 + movq [dstq+r10], xm0 + add r10, 8 + jl .n0_loop + add dstq, strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + mova m3, [t3+r10*4+400*8+0] + paddd m3, [t3+r10*4+400*8+8] + paddd m5, m3, [t3+r10*4+400*8+4] + paddd m5, m5 ; ab3[ 1] 222 + mova m4, [t3+r10*4+400*20] + paddd m1, m4, [t3+r10*4+400*28] ; ab3[ 0] 222 + ab3[-1] 343 + mova [t3+r10*4+400*20], m5 + paddd m5, m5 + 
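+; (The a3/b3 smoothing uses the 3x3 weight matrix [[3,4,3],[4,4,4],[3,4,3]],
+; total weight 32. Each row is reduced once to a 3,4,3-weighted "343" term
+; and a 2,2,2-weighted "222" term; the center row's 4,4,4 share is formed
+; as 222+222, and a 343 term is derived below by doubling a 222 term and
+; subtracting the unweighted outer pair, so no row data is re-read.)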
psubd m5, m3 ; ab3[ 1] 343 + mova [t3+r10*4+400*28], m5 + paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343 + pandn m3, m6, m1 + psrld m1, 12 + pandn m5, m6, m4 + psrld m4, 12 + paddd m3, m5 ; -a3 + paddd m1, m4 ; b3 + (1 << 8) + pmovzxbd m4, [dstq+r10] + pmaddwd m2, m4, [t3+r10*4+400*12] ; -a5 * src + mova m0, [t3+r10*4+400*16] ; b5 + (1 << 7) + pmaddwd m3, m4 ; -a3 * src + psubd m0, m2 ; a5 * src + b5 + (1 << 7) + psubd m1, m3 ; a3 * src + b3 + (1 << 8) + psrld m0, 8 + pslld m1, 7 + pblendw m0, m1, 0xaa + pmaddwd m0, m15 + psubd m0, m6 + psrad m0, 13 + paddd m0, m4 + vextracti128 xm1, m0, 1 + packssdw xm0, xm1 + packuswb xm0, xm0 + movq [dstq+r10], xm0 + add r10, 8 + jl .n1_loop + add dstq, strideq + ret + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/looprestoration_avx512.asm b/third_party/dav1d/src/x86/looprestoration_avx512.asm new file mode 100644 index 0000000000..1e571774ca --- /dev/null +++ b/third_party/dav1d/src/x86/looprestoration_avx512.asm @@ -0,0 +1,2122 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
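All of the sgr_filter_* functions in this patch, the AVX2 ones above and the AVX-512 ones below, vectorize the same per-pixel self-guided computation. The scalar C below is an illustrative sketch rather than dav1d's reference code (sgr_ab and its parameter names are invented for this note), but the constants are the ones the assembly comments show: 164 and 455 are round(2^12/n), 34816 is (1 << 11) + (1 << 15), and the pd_m4096 mask yields the packed a | (b << 12) dwords written to the t3 buffer.

    #include <stdint.h>

    /* 256-entry table shared with the assembly (cextern sgr_x_by_x). */
    extern const uint8_t dav1d_sgr_x_by_x[256];

    /* n is the box area (25 for the 5x5 filter, 9 for the 3x3 one); s is
     * the per-plane strength word loaded from paramsq in the prologues. */
    static uint32_t sgr_ab(uint32_t sum, uint32_t sumsq, int n, uint32_t s)
    {
        const uint32_t one_by_x = n == 25 ? 164 : 455; /* round(2^12 / n) */
        const uint32_t p = sumsq * n - sum * sum;      /* n^2 * box variance */
        const uint32_t z = (uint32_t)(((uint64_t)p * s + (1 << 19)) >> 20);
        const uint32_t x = dav1d_sgr_x_by_x[z > 255 ? 255 : z];
        /* The 1 << 15 bias adds 8 to b; summed over the weight-32 neighbor
         * passes (.prep_n/.n0/.n1) it becomes their 1 << 8 rounding term. */
        const uint32_t b = (x * sum * one_by_x + (1 << 11) + (1 << 15)) & ~4095u;
        return b | x; /* a | (b << 12) */
    }

The z computation is done in 64 bits here for clarity; the assembly instead clamps with saturating word arithmetic (the pw_61448 constant). The two files also differ in how they resolve the table lookup: the AVX2 code gathers with vpgatherdd, while the AVX-512 code keeps the whole 256-byte table in registers (m18-m21, or m20-m23 in the mix function) and combines vpermi2b/vpermt2b with a sign-bit mask instead.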
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 + +wiener_shufA: db 1, 2, 7, 6, 3, 4, 9, 8, 5, 6, 11, 10, 7, 8, 13, 12 +wiener_shufB: db 2, 3, 8, 7, 4, 5, 10, 9, 6, 7, 12, 11, 8, 9, 14, 13 +wiener_shufC: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 +wiener_shufD: db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +wiener_perm32: db 1, 9, 3, 11, 5, 13, 7, 15, 33, 41, 35, 43, 37, 45, 39, 47 + db 17, 25, 19, 27, 21, 29, 23, 31, 49, 57, 51, 59, 53, 61, 55, 63 +sgr_shuf: db 128, 1, -1, 2,132, 3, -1, 4,136, 5, -1, 6,140, 7, -1, 8 + db 129, 9, -1, 10,133, 11, -1, 12,137, -1, -1, -1,141, -1, 0,128 +sgr_mix_perm: db 1, 3, 5, 7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55 +r_ext_mask: times 68 db -1 + times 4 db 0 +wiener_x_shuf: db 0, 2, -1, 0 +wiener_x_add: db 0, 1,127, 0 + +pw_61448: times 2 dw 61448 +pw_164_455: dw 164, 455 +pd_m16380: dd -16380 +pd_m4096: dd -4096 +pd_m25: dd -25 +pd_m9: dd -9 +pd_34816: dd 34816 +pd_8421376: dd 8421376 + +cextern sgr_x_by_x + +SECTION .text + +DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers + +INIT_ZMM avx512icl +cglobal wiener_filter7_8bpc, 4, 15, 20, -384*12-16, dst, stride, left, lpf, \ + w, h, edge, flt + mov fltq, r6mp + mov wd, wm + movifnidn hd, hm + mov edged, r7m + vbroadcasti32x4 m6, [wiener_shufA] + vbroadcasti32x4 m7, [wiener_shufB] + mov r10d, 0xfffe + vbroadcasti32x4 m8, [wiener_shufC] + vbroadcasti32x4 m9, [wiener_shufD] + kmovw k1, r10d + vpbroadcastd m0, [wiener_x_shuf] + vpbroadcastd m1, [wiener_x_add] + mov r10, 0xaaaaaaaaaaaaaaaa + vpbroadcastd m11, [fltq+ 0] + vpbroadcastd m12, [fltq+ 4] + kmovq k2, r10 + vpbroadcastd m10, [pd_m16380] + packsswb m11, m11 ; x0 x1 x0 x1 + vpbroadcastd m14, [fltq+16] + pshufb m12, m0 + vpbroadcastd m15, [fltq+20] + paddb m12, m1 ; x2 x3+1 x2 127 + vpbroadcastd m13, [pd_8421376] + psllw m14, 5 ; y0 y1 + psllw m15, 5 ; y2 y3 + cmp wd, 32 ; the minimum lr unit size for chroma in 4:2:0 is 32 + jle .w32 ; pixels, so we need a special case for small widths + lea t1, [rsp+wq*2+16] + add lpfq, wq + add dstq, wq + neg wq + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t6, t1 + mov t5, t1 + add t1, 384*2 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + mov t4, t1 + add t1, 384*2 + add r10, strideq + mov [rsp], r10 ; below + call .h + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v3 +.main: + lea t0, [t1+384*2] +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v3 + mov lpfq, [rsp] + call .hv_bottom + add lpfq, strideq + call .hv_bottom +.v1: + call .v + RET +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + mov t6, t1 + mov t5, t1 + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v3 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v3 + add t0, 384*8 + call .hv + dec hd + jnz .main +.v3: + call .v +.v2: + call .v + jmp .v1 +.h: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movd xm16, [leftq] + vmovdqu32 m16{k1}, [lpfq+r10-4] + add leftq, 4 + jmp .h_main +.h_extend_left: + vpbroadcastb xm16, [lpfq+r10] ; the masked load ensures that no exception + vmovdqu32 m16{k1}, [lpfq+r10-4] ; gets raised from accessing
invalid memory + jmp .h_main +.h_top: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m16, [lpfq+r10-4] +.h_main: + movu m17, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -66 + jl .h_have_right + push r0 + lea r0, [r_ext_mask+65] + vpbroadcastb m0, [lpfq-1] + vpternlogd m16, m0, [r0+r10+0], 0xe4 ; c ? a : b + vpternlogd m17, m0, [r0+r10+8], 0xe4 + pop r0 +.h_have_right: + pshufb m4, m16, m6 + mova m0, m10 + vpdpbusd m0, m4, m11 + pshufb m4, m16, m7 + mova m2, m10 + vpdpbusd m2, m4, m11 + pshufb m4, m17, m6 + mova m1, m10 + vpdpbusd m1, m4, m11 + pshufb m4, m17, m7 + mova m3, m10 + vpdpbusd m3, m4, m11 + pshufb m4, m16, m8 + vpdpbusd m0, m4, m12 + pshufb m16, m9 + vpdpbusd m2, m16, m12 + pshufb m4, m17, m8 + vpdpbusd m1, m4, m12 + pshufb m17, m9 + vpdpbusd m3, m17, m12 + packssdw m0, m2 + packssdw m1, m3 + psraw m0, 3 + psraw m1, 3 + mova [t1+r10*2+ 0], m0 + mova [t1+r10*2+64], m1 + add r10, 64 + jl .h_loop + ret +ALIGN function_align +.hv: + add lpfq, strideq + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movd xm16, [leftq] + vmovdqu32 m16{k1}, [lpfq+r10-4] + add leftq, 4 + jmp .hv_main +.hv_extend_left: + vpbroadcastb xm16, [lpfq+r10] + vmovdqu32 m16{k1}, [lpfq+r10-4] + jmp .hv_main +.hv_bottom: + mov r10, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m16, [lpfq+r10-4] +.hv_main: + movu m17, [lpfq+r10+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -66 + jl .hv_have_right + push r0 + lea r0, [r_ext_mask+65] + vpbroadcastb m0, [lpfq-1] + vpternlogd m16, m0, [r0+r10+0], 0xe4 ; c ? a : b + vpternlogd m17, m0, [r0+r10+8], 0xe4 + pop r0 +.hv_have_right: + pshufb m4, m16, m6 + mova m0, m10 + vpdpbusd m0, m4, m11 + pshufb m4, m16, m7 + mova m2, m10 + vpdpbusd m2, m4, m11 + pshufb m4, m17, m6 + mova m1, m10 + vpdpbusd m1, m4, m11 + pshufb m4, m17, m7 + mova m3, m10 + vpdpbusd m3, m4, m11 + pshufb m4, m16, m8 + vpdpbusd m0, m4, m12 + pshufb m16, m9 + vpdpbusd m2, m16, m12 + pshufb m4, m17, m8 + vpdpbusd m1, m4, m12 + pshufb m17, m9 + vpdpbusd m3, m17, m12 + packssdw m0, m2 + packssdw m1, m3 + psraw m0, 3 + psraw m1, 3 + mova m16, [t4+r10*2] + paddw m16, [t2+r10*2] + mova m3, [t3+r10*2] + mova m17, [t4+r10*2+64] + paddw m17, [t2+r10*2+64] + mova m5, [t3+r10*2+64] + punpcklwd m4, m16, m3 + mova m2, m13 + vpdpwssd m2, m4, m15 + punpcklwd m18, m17, m5 + mova m4, m13 + vpdpwssd m4, m18, m15 + punpckhwd m16, m3 + mova m3, m13 + vpdpwssd m3, m16, m15 + punpckhwd m17, m5 + mova m5, m13 + vpdpwssd m5, m17, m15 + mova m17, [t5+r10*2] + paddw m17, [t1+r10*2] + paddw m16, m0, [t6+r10*2] + mova m19, [t5+r10*2+64] + paddw m19, [t1+r10*2+64] + paddw m18, m1, [t6+r10*2+64] + mova [t0+r10*2+ 0], m0 + mova [t0+r10*2+64], m1 + punpcklwd m0, m16, m17 + vpdpwssd m2, m0, m14 + punpcklwd m1, m18, m19 + vpdpwssd m4, m1, m14 + punpckhwd m16, m17 + vpdpwssd m3, m16, m14 + punpckhwd m18, m19 + vpdpwssd m5, m18, m14 + packuswb m2, m4 + psrlw m2, 8 + vpackuswb m2{k2}, m3, m5 + movu [dstq+r10], m2 ; We don't have a separate 5-tap version so the 7-tap + add r10, 64 ; function is used for chroma as well, and in some + jl .hv_loop ; esoteric edge cases chroma dst pointers may only + mov t6, t5 ; have a 32-byte alignment despite having a width + mov t5, t4 ; larger than 32, so use an unaligned store here. 
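+; (t0-t6 are the ring buffer pointers named in DECLARE_REG_TMP above; this
+; pointer rotation slides the 7-row filter window down one row without
+; copying any of the buffered intermediate rows.)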
+ mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t6 + add dstq, strideq + ret +.v: + mov r10, wq +.v_loop: + mova m4, [t4+r10*2+ 0] + paddw m4, [t2+r10*2+ 0] + mova m1, [t3+r10*2+ 0] + mova m5, [t4+r10*2+64] + paddw m5, [t2+r10*2+64] + mova m3, [t3+r10*2+64] + punpcklwd m6, m4, m1 + mova m0, m13 + vpdpwssd m0, m6, m15 + punpcklwd m6, m5, m3 + mova m2, m13 + vpdpwssd m2, m6, m15 + punpckhwd m4, m1 + mova m1, m13 + vpdpwssd m1, m4, m15 + punpckhwd m5, m3 + mova m3, m13 + vpdpwssd m3, m5, m15 + mova m5, [t1+r10*2+ 0] + paddw m4, m5, [t6+r10*2+ 0] + paddw m5, [t5+r10*2+ 0] + mova m7, [t1+r10*2+64] + paddw m6, m7, [t6+r10*2+64] + paddw m7, [t5+r10*2+64] + punpcklwd m8, m4, m5 + vpdpwssd m0, m8, m14 + punpcklwd m8, m6, m7 + vpdpwssd m2, m8, m14 + punpckhwd m4, m5 + vpdpwssd m1, m4, m14 + punpckhwd m6, m7 + vpdpwssd m3, m6, m14 + packuswb m0, m2 + psrlw m0, 8 + vpackuswb m0{k2}, m1, m3 + movu [dstq+r10], m0 + add r10, 64 + jl .v_loop + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + add dstq, strideq + ret +.w32: + lea r10, [r_ext_mask+73] + mova ym18, [wiener_perm32] + lea t1, [rsp+16] + sub r10, wq + test edgeb, 4 ; LR_HAVE_TOP + jz .w32_no_top + call .w32_h_top + add lpfq, strideq + mov t6, t1 + mov t5, t1 + add t1, 32*2 + call .w32_h_top + lea r9, [lpfq+strideq*4] + mov lpfq, dstq + mov t4, t1 + add t1, 32*2 + add r9, strideq + mov [rsp], r9 ; below + call .w32_h + mov t3, t1 + mov t2, t1 + dec hd + jz .w32_v1 + add lpfq, strideq + add t1, 32*2 + call .w32_h + mov t2, t1 + dec hd + jz .w32_v2 + add lpfq, strideq + add t1, 32*2 + call .w32_h + dec hd + jz .w32_v3 +.w32_main: + lea t0, [t1+32*2] +.w32_main_loop: + call .w32_hv + dec hd + jnz .w32_main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .w32_v3 + mov lpfq, [rsp] + call .w32_hv_bottom + add lpfq, strideq + call .w32_hv_bottom +.w32_v1: + call .w32_v + RET +.w32_no_top: + lea r9, [lpfq+strideq*4] + mov lpfq, dstq + lea r9, [r9+strideq*2] + mov [rsp], r9 + call .w32_h + mov t6, t1 + mov t5, t1 + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .w32_v1 + add lpfq, strideq + add t1, 32*2 + call .w32_h + mov t2, t1 + dec hd + jz .w32_v2 + add lpfq, strideq + add t1, 32*2 + call .w32_h + dec hd + jz .w32_v3 + lea t0, [t1+32*2] + call .w32_hv + dec hd + jz .w32_v3 + add t0, 32*8 + call .w32_hv + dec hd + jnz .w32_main +.w32_v3: + call .w32_v +.w32_v2: + call .w32_v + jmp .w32_v1 +.w32_h: + test edgeb, 1 ; LR_HAVE_LEFT + jz .w32_h_extend_left + movd xm16, [leftq] + vmovdqu32 ym16{k1}, [lpfq-4] + add leftq, 4 + jmp .w32_h_main +.w32_h_extend_left: + vpbroadcastb xm16, [lpfq] ; the masked load ensures that no exception + vmovdqu32 ym16{k1}, [lpfq-4] ; gets raised from accessing invalid memory + jmp .w32_h_main +.w32_h_top: + test edgeb, 1 ; LR_HAVE_LEFT + jz .w32_h_extend_left + movu ym16, [lpfq-4] +.w32_h_main: + vinserti32x8 m16, [lpfq+4], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .w32_h_have_right + vpbroadcastb m0, [lpfq+wq-1] + movu ym17, [r10-8] + vinserti32x8 m17, [r10+0], 1 + vpternlogd m16, m0, m17, 0xe4 ; c ? 
a : b +.w32_h_have_right: + pshufb m2, m16, m6 + mova m0, m10 + vpdpbusd m0, m2, m11 + pshufb m2, m16, m7 + mova m1, m10 + vpdpbusd m1, m2, m11 + pshufb m2, m16, m8 + vpdpbusd m0, m2, m12 + pshufb m16, m9 + vpdpbusd m1, m16, m12 + packssdw m0, m1 + psraw m0, 3 + mova [t1], m0 + ret +.w32_hv: + add lpfq, strideq + test edgeb, 1 ; LR_HAVE_LEFT + jz .w32_hv_extend_left + movd xm16, [leftq] + vmovdqu32 ym16{k1}, [lpfq-4] + add leftq, 4 + jmp .w32_hv_main +.w32_hv_extend_left: + vpbroadcastb xm16, [lpfq] + vmovdqu32 ym16{k1}, [lpfq-4] + jmp .w32_hv_main +.w32_hv_bottom: + test edgeb, 1 ; LR_HAVE_LEFT + jz .w32_hv_extend_left + movu ym16, [lpfq-4] +.w32_hv_main: + vinserti32x8 m16, [lpfq+4], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .w32_hv_have_right + vpbroadcastb m0, [lpfq+wq-1] + movu ym17, [r10-8] + vinserti32x8 m17, [r10+0], 1 + vpternlogd m16, m0, m17, 0xe4 +.w32_hv_have_right: + mova m3, [t4] + paddw m3, [t2] + mova m2, [t3] + pshufb m4, m16, m6 + mova m0, m10 + vpdpbusd m0, m4, m11 + pshufb m4, m16, m7 + mova m5, m10 + vpdpbusd m5, m4, m11 + punpcklwd m4, m3, m2 + mova m1, m13 + vpdpwssd m1, m4, m15 + punpckhwd m3, m2 + mova m2, m13 + vpdpwssd m2, m3, m15 + pshufb m4, m16, m8 + vpdpbusd m0, m4, m12 + pshufb m16, m9 + vpdpbusd m5, m16, m12 + packssdw m0, m5 + psraw m0, 3 + mova m4, [t5] + paddw m4, [t1] + paddw m3, m0, [t6] + mova [t0], m0 + punpcklwd m0, m3, m4 + vpdpwssd m1, m0, m14 + punpckhwd m3, m4 + vpdpwssd m2, m3, m14 + packuswb m1, m2 + vpermb m16, m18, m1 + mova [dstq], ym16 + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t6 + add dstq, strideq + ret +.w32_v: + mova m2, [t4] + paddw m2, [t2] + mova m1, [t3] + mova m4, [t1] + paddw m3, m4, [t6] + paddw m4, [t5] + punpcklwd m5, m2, m1 + mova m0, m13 + vpdpwssd m0, m5, m15 + punpckhwd m2, m1 + mova m1, m13 + vpdpwssd m1, m2, m15 + punpcklwd m2, m3, m4 + vpdpwssd m0, m2, m14 + punpckhwd m3, m4 + vpdpwssd m1, m3, m14 + packuswb m0, m1 + vpermb m16, m18, m0 + mova [dstq], ym16 + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + add dstq, strideq + ret + +cglobal sgr_filter_5x5_8bpc, 4, 13, 23, 416*24+16, dst, stride, left, lpf, \ + w, h, edge, params + mov paramsq, r6mp + mov wd, wm + mov hd, hm + mov edged, r7m + vbroadcasti32x4 m5, [sgr_shuf+1] + add lpfq, wq + vbroadcasti32x4 m6, [sgr_shuf+9] + add dstq, wq + vbroadcasti32x4 m7, [sgr_shuf+3] + lea t3, [rsp+wq*4+16+416*12] + vbroadcasti32x4 m8, [sgr_shuf+7] + pxor m4, m4 + vpbroadcastd m9, [pd_m25] + vpsubd m11, m4, [paramsq+0] {1to16} ; -s0 + vpbroadcastw m15, [paramsq+8] ; w0 + lea t1, [rsp+wq*2+20] + vpbroadcastd m10, [pw_164_455] + neg wq + vpbroadcastd m12, [pw_61448] ; (15 << 12) + (1 << 3) + mov r10d, 0xfe + vpbroadcastd m13, [pd_m4096] + kmovb k1, r10d + vpbroadcastd m14, [pd_34816] ; (1 << 11) + (1 << 15) + mov r10, 0x3333333333333333 + mova m18, [sgr_x_by_x+64*0] + kmovq k2, r10 + mova m19, [sgr_x_by_x+64*1] + lea r12, [r_ext_mask+75] + mova m20, [sgr_x_by_x+64*2] + psllw m15, 4 + mova m21, [sgr_x_by_x+64*3] + lea r10, [lpfq+strideq*4] + mova ym22, [sgr_shuf] + add r10, strideq + mov [rsp], r10 ; below + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t2, t1 + call .top_fixup + add t1, 416*6 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, strideq + mov [rsp], r10 ; below + mov t0, t2 + dec hd + jz .height1 + or edged, 16 + call .h +.main: + add lpfq, strideq + call .hv + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, strideq + test hd, hd + jz 
.odd_height + call .h + add lpfq, strideq + call .hv + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .h_top + add lpfq, strideq + call .hv_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .hv + call .prep_n + jmp .odd_height_end +.odd_height: + call .hv + call .n0 + call .n1 +.odd_height_end: + call .v + call .n0 + jmp .end2 +.extend_bottom: + call .v + jmp .end +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + lea t2, [t1+416*6] + call .top_fixup + dec hd + jz .no_top_height1 + or edged, 16 + mov t0, t1 + mov t1, t2 + jmp .main +.no_top_height1: + call .v + call .prep_n + jmp .odd_height_end +.h: ; horizontal boxsum + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movd xm17, [leftq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + add leftq, 4 + jmp .h_main +.h_extend_left: + vpbroadcastb xm17, [lpfq+wq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + jmp .h_main +.h_top: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu ym17, [lpfq+r10-2] +.h_main: + vinserti32x8 m17, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -34 + jl .h_have_right + vpbroadcastb m0, [lpfq-1] + movu ym16, [r12+r10-8] + vinserti32x8 m16, [r12+r10+0], 1 + vpternlogd m17, m0, m16, 0xe4 +.h_have_right: + pshufb m3, m17, m5 + pmullw m2, m3, m3 + pshufb m1, m17, m6 + paddw m0, m3, m1 + shufps m3, m1, q2121 + paddw m0, m3 + punpcklwd m16, m3, m1 + punpckhwd m3, m1 + punpcklwd m1, m2, m4 + vpdpwssd m1, m16, m16 + punpckhwd m2, m4 + vpdpwssd m2, m3, m3 + pshufb m16, m17, m7 + paddw m0, m16 + pshufb m17, m8 + paddw m0, m17 ; sum + punpcklwd m3, m16, m17 + vpdpwssd m1, m3, m3 ; sumsq + punpckhwd m16, m17 + vpdpwssd m2, m16, m16 + test edgeb, 16 ; y > 0 + jz .h_loop_end + paddw m0, [t1+r10*2+416*0] + paddd m1, [t1+r10*2+416*2] + paddd m2, [t1+r10*2+416*4] +.h_loop_end: + mova [t1+r10*2+416*0], m0 + mova [t1+r10*2+416*2], m1 + mova [t1+r10*2+416*4], m2 + add r10, 32 + jl .h_loop + ret +.top_fixup: + lea r10, [wq-2] +.top_fixup_loop: ; the sums of the first row need to be doubled + mova m0, [t1+r10*2+416*0] + mova m1, [t1+r10*2+416*2] + mova m2, [t1+r10*2+416*4] + paddw m0, m0 + paddd m1, m1 + paddd m2, m2 + mova [t2+r10*2+416*0], m0 + mova [t2+r10*2+416*2], m1 + mova [t2+r10*2+416*4], m2 + add r10, 32 + jl .top_fixup_loop + ret +ALIGN function_align +.hv: ; horizontal boxsum + vertical boxsum + ab + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movd xm17, [leftq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + add leftq, 4 + jmp .hv_main +.hv_extend_left: + vpbroadcastb xm17, [lpfq+wq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + jmp .hv_main +.hv_bottom: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu ym17, [lpfq+r10-2] +.hv_main: + vinserti32x8 m17, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -34 + jl .hv_have_right + vpbroadcastb m0, [lpfq-1] + movu ym16, [r12+r10-8] + vinserti32x8 m16, [r12+r10+0], 1 + vpternlogd m17, m0, m16, 0xe4 +.hv_have_right: + pshufb m1, m17, m5 + pmullw m3, m1, m1 + pshufb m2, m17, m6 + paddw m0, m1, m2 + shufps m1, m2, q2121 + paddw m0, m1 + punpcklwd m16, m1, m2 + punpckhwd m1, m2 + punpcklwd m2, m3, m4 + vpdpwssd m2, m16, m16 + punpckhwd m3, m4 + vpdpwssd m3, m1, m1 + pshufb m16, m17, m7 + paddw m0, m16 + pshufb m17, m8 + paddw m0, m17 ; h sum + punpcklwd m1, m16, m17 + vpdpwssd m2, m1, m1 ; h
sumsq + punpckhwd m16, m17 + vpdpwssd m3, m16, m16 + paddw m1, m0, [t1+r10*2+416*0] + paddd m16, m2, [t1+r10*2+416*2] + paddd m17, m3, [t1+r10*2+416*4] + test hd, hd + jz .hv_last_row +.hv_main2: + paddd m16, [t2+r10*2+416*2] ; hv sumsq + paddd m17, [t2+r10*2+416*4] + paddw m1, [t2+r10*2+416*0] ; hv sum + mova [t0+r10*2+416*2], m2 + mova [t0+r10*2+416*4], m3 + mova [t0+r10*2+416*0], m0 + pmulld m16, m9 ; -a * 25 + pmulld m17, m9 + punpcklwd m0, m1, m4 ; b + vpdpwssd m16, m0, m0 ; -p + punpckhwd m1, m4 + vpdpwssd m17, m1, m1 + pmaddwd m0, m10 ; b * 164 + pmaddwd m1, m10 + pmulld m16, m11 ; p * s + pmulld m17, m11 + vpalignr m17{k2}, m16, m16, 2 + mova m16, m20 + paddusw m17, m12 + psraw m17, 4 ; min(z, 255) - 256 + vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x + pandn m16, m13, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m14 + vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12) + vpternlogd m17, m1, m13, 0xd8 + mova [t3+r10*4+ 8], m16 ; The neighbor calculations require + mova [t3+r10*4+ 24], xm17 ; 13 bits for a and 21 bits for b. + vextracti32x4 [t3+r10*4+ 56], m17, 2 ; Packing them allows for 12+20, but + mova [t3+r10*4+ 72], m17 ; that gets us most of the way. + vextracti128 [t3+r10*4+ 72], ym16, 1 + vextracti32x4 [t3+r10*4+104], m16, 3 + add r10, 32 + jl .hv_loop + mov t2, t1 + mov t1, t0 + mov t0, t2 + ret +.hv_last_row: ; esoteric edge case for odd heights + mova [t1+r10*2+416*0], m1 + paddw m1, m0 + mova [t1+r10*2+416*2], m16 + paddd m16, m2 + mova [t1+r10*2+416*4], m17 + paddd m17, m3 + jmp .hv_main2 +.v: ; vertical boxsum + ab + lea r10, [wq-2] +.v_loop: + mova m2, [t1+r10*2+416*2] + paddd m16, m2, [t2+r10*2+416*2] + mova m3, [t1+r10*2+416*4] + paddd m17, m3, [t2+r10*2+416*4] + paddd m2, m2 + paddd m3, m3 + paddd m16, m2 ; hv sumsq + paddd m17, m3 + pmulld m16, m9 ; -a * 25 + pmulld m17, m9 + mova m0, [t1+r10*2+416*0] + paddw m1, m0, [t2+r10*2+416*0] + paddw m0, m0 + paddw m1, m0 ; hv sum + punpcklwd m0, m1, m4 ; b + vpdpwssd m16, m0, m0 ; -p + punpckhwd m1, m4 + vpdpwssd m17, m1, m1 + pmaddwd m0, m10 ; b * 164 + pmaddwd m1, m10 + pmulld m16, m11 ; p * s + pmulld m17, m11 + vpalignr m17{k2}, m16, m16, 2 + mova m16, m20 + paddusw m17, m12 + psraw m17, 4 ; min(z, 255) - 256 + vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x + pandn m16, m13, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m14 + vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12) + vpternlogd m17, m1, m13, 0xd8 + mova [t3+r10*4+ 8], m16 + mova [t3+r10*4+ 24], xm17 + vextracti32x4 [t3+r10*4+ 56], m17, 2 + mova [t3+r10*4+ 72], m17 + vextracti128 [t3+r10*4+ 72], ym16, 1 + vextracti32x4 [t3+r10*4+104], m16, 3 + add r10, 32 + jl .v_loop + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + movu m0, [t3+r10*4+ 4] + movu m1, [t3+r10*4+68] + paddd m2, m0, [t3+r10*4+ 0] + paddd m3, m1, [t3+r10*4+64] + paddd m2, [t3+r10*4+ 8] + paddd m3, [t3+r10*4+72] + paddd m0, m2 + pslld m2, 2 + paddd m1, m3 + pslld m3, 2 + paddd m2, m0 ; ab 565 + paddd m3, m1 + pandn m0, m13, m2 ; a + psrld m2, 12 ; b + pandn m1, m13, m3 + psrld m3, 12 + mova [t3+r10*4+416*4+ 0], m0 + mova [t3+r10*4+416*8+ 0], m2 + mova [t3+r10*4+416*4+64], m1 + mova [t3+r10*4+416*8+64], m3 + add r10, 32 + jl .prep_n_loop + ret +ALIGN
function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + movu m16, [t3+r10*4+ 4] + movu m17, [t3+r10*4+68] + paddd m0, m16, [t3+r10*4+ 0] + paddd m1, m17, [t3+r10*4+64] + paddd m0, [t3+r10*4+ 8] + paddd m1, [t3+r10*4+72] + paddd m16, m0 + pslld m0, 2 + paddd m17, m1 + pslld m1, 2 + paddd m0, m16 + paddd m1, m17 + pandn m16, m13, m0 + psrld m0, 12 + pandn m17, m13, m1 + psrld m1, 12 + paddd m2, m16, [t3+r10*4+416*4+ 0] ; a + paddd m3, m17, [t3+r10*4+416*4+64] + mova [t3+r10*4+416*4+ 0], m16 + mova [t3+r10*4+416*4+64], m17 + paddd m16, m0, [t3+r10*4+416*8+ 0] ; b + (1 << 8) + paddd m17, m1, [t3+r10*4+416*8+64] + mova [t3+r10*4+416*8+ 0], m0 + mova [t3+r10*4+416*8+64], m1 + pmovzxbd m0, [dstq+r10+ 0] + pmovzxbd m1, [dstq+r10+16] + pmaddwd m2, m0 ; a * src + pmaddwd m3, m1 + packssdw m0, m1 + psubd m16, m2 ; b - a * src + (1 << 8) + psubd m17, m3 + psrad m16, 9 + psrad m17, 9 + packssdw m16, m17 + pmulhrsw m16, m15 + paddw m16, m0 + packuswb m16, m16 + vpermd m16, m22, m16 + mova [dstq+r10], ym16 + add r10, 32 + jl .n0_loop + add dstq, strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + pmovzxbd m0, [dstq+r10+ 0] + pmovzxbd m1, [dstq+r10+16] + pmaddwd m2, m0, [t3+r10*4+416*4+ 0] ; a * src + pmaddwd m3, m1, [t3+r10*4+416*4+64] + mova m16, [t3+r10*4+416*8+ 0] ; b + (1 << 7) + mova m17, [t3+r10*4+416*8+64] + packssdw m0, m1 + psubd m16, m2 ; b - a * src + (1 << 7) + psubd m17, m3 + psrad m16, 8 + psrad m17, 8 + packssdw m16, m17 + pmulhrsw m16, m15 + paddw m16, m0 + packuswb m16, m16 + vpermd m16, m22, m16 + mova [dstq+r10], ym16 + add r10, 32 + jl .n1_loop + add dstq, strideq + ret + +cglobal sgr_filter_3x3_8bpc, 4, 15, 22, -416*28-16, dst, stride, left, lpf, \ + w, h, edge, params + mov paramsq, r6mp + mov wd, wm + movifnidn hd, hm + mov edged, r7m + vbroadcasti32x4 m5, [sgr_shuf+3] + add lpfq, wq + vbroadcasti32x4 m6, [sgr_shuf+5] + add dstq, wq + vbroadcasti32x4 m7, [sgr_shuf+7] + pxor m4, m4 + vpbroadcastd m8, [pd_m9] + vpsubd m11, m4, [paramsq+4] {1to16} ; -s1 + vpbroadcastw m15, [paramsq+10] ; w1 + lea t1, [rsp+wq*2+20] + vpbroadcastd m10, [pw_164_455] + lea t3, [rsp+wq*4+16+416*12] + vpbroadcastd m12, [pw_61448] ; (15 << 12) + (1 << 3) + neg wq + vpbroadcastd m13, [pd_m4096] + mov r10d, 0xfe + vpbroadcastd m14, [pd_34816] ; (1 << 11) + (1 << 15) + kmovb k1, r10d + mova m18, [sgr_x_by_x+64*0] + mov r10, 0x3333333333333333 + mova m19, [sgr_x_by_x+64*1] + kmovq k2, r10 + mova m20, [sgr_x_by_x+64*2] + psllw m15, 4 + mova m21, [sgr_x_by_x+64*3] + lea r14, [r_ext_mask+75] + mova ym9, [sgr_shuf] + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t2, t1 + add t1, 416*6 + call .h_top + lea t4, [lpfq+strideq*4] + mov lpfq, dstq + add t4, strideq + mov [rsp], t4 ; below + mov t0, t2 + call .hv +.main: + mov t5, t3 + add t3, 416*4 + dec hd + jz .height1 + add lpfq, strideq + call .hv + call .prep_n + dec hd + jz .extend_bottom +.main_loop: + add lpfq, strideq + call .hv + call .n + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .hv_bottom + call .n + add lpfq, strideq + call .hv_bottom +.end: + call .n + RET +.height1: + call .v + call .prep_n + mov t2, t1 + call .v + jmp .end +.extend_bottom: + call .v + call .n + mov t2, t1 + call .v + jmp .end +.no_top: + lea t4, [lpfq+strideq*4] + mov lpfq, dstq + lea t4, [t4+strideq*2] + mov [rsp], t4 + call .h + lea t0, [t1+416*6] + mov t2, t1 + call .v + jmp .main +.h: ; horizontal boxsum + lea r10, 
[wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movd xm17, [leftq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + add leftq, 4 + jmp .h_main +.h_extend_left: + vpbroadcastb xm17, [lpfq+wq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + jmp .h_main +.h_top: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu ym17, [lpfq+r10-2] +.h_main: + vinserti32x8 m17, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -33 + jl .h_have_right + vpbroadcastb m0, [lpfq-1] + movu ym16, [r14+r10-8] + vinserti32x8 m16, [r14+r10+0], 1 + vpternlogd m17, m0, m16, 0xe4 +.h_have_right: + pshufb m0, m17, m5 + pmullw m2, m0, m0 + pshufb m16, m17, m6 + paddw m0, m16 + pshufb m17, m7 + paddw m0, m17 ; sum + punpcklwd m3, m16, m17 + punpcklwd m1, m2, m4 + vpdpwssd m1, m3, m3 ; sumsq + punpckhwd m16, m17 + punpckhwd m2, m4 + vpdpwssd m2, m16, m16 + mova [t1+r10*2+416*0], m0 + mova [t1+r10*2+416*2], m1 + mova [t1+r10*2+416*4], m2 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv: ; horizontal boxsum + vertical boxsum + ab + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movd xm17, [leftq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + add leftq, 4 + jmp .hv_main +.hv_extend_left: + vpbroadcastb xm17, [lpfq+wq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + jmp .hv_main +.hv_bottom: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu ym17, [lpfq+r10-2] +.hv_main: + vinserti32x8 m17, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp r10d, -33 + jl .hv_have_right + vpbroadcastb m0, [lpfq-1] + movu ym16, [r14+r10-8] + vinserti32x8 m16, [r14+r10+0], 1 + vpternlogd m17, m0, m16, 0xe4 +.hv_have_right: + pshufb m0, m17, m5 + pmullw m3, m0, m0 + pshufb m1, m17, m6 + paddw m0, m1 + pshufb m17, m7 + paddw m0, m17 ; h sum + punpcklwd m16, m17, m1 + punpcklwd m2, m3, m4 + vpdpwssd m2, m16, m16 ; h sumsq + punpckhwd m17, m1 + punpckhwd m3, m4 + vpdpwssd m3, m17, m17 + paddw m1, m0, [t2+r10*2+416*0] + paddw m1, [t1+r10*2+416*0] ; hv sum + paddd m16, m2, [t2+r10*2+416*2] + paddd m17, m3, [t2+r10*2+416*4] + paddd m16, [t1+r10*2+416*2] ; hv sumsq + paddd m17, [t1+r10*2+416*4] + mova [t0+r10*2+416*0], m0 + mova [t0+r10*2+416*2], m2 + mova [t0+r10*2+416*4], m3 + pmulld m16, m8 ; -a * 9 + pmulld m17, m8 + punpcklwd m0, m4, m1 ; b + vpdpwssd m16, m0, m0 ; -p + punpckhwd m1, m4, m1 + vpdpwssd m17, m1, m1 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + pmulld m16, m11 ; p * s + pmulld m17, m11 + vpalignr m17{k2}, m16, m16, 2 + mova m16, m20 + paddusw m17, m12 + psraw m17, 4 ; min(z, 255) - 256 + vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x + pandn m16, m13, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + paddd m0, m14 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m14 + vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12) + vpternlogd m17, m1, m13, 0xd8 + mova [t3+r10*4+ 8], m16 + mova [t3+r10*4+ 24], xm17 + vextracti32x4 [t3+r10*4+ 56], m17, 2 + mova [t3+r10*4+ 72], m17 + vextracti128 [t3+r10*4+ 72], ym16, 1 + vextracti32x4 [t3+r10*4+104], m16, 3 + add r10, 32 + jl .hv_loop + mov t2, t1 + mov t1, t0 + mov t0, t2 + ret +.v: ; vertical boxsum + ab + lea r10, [wq-2] +.v_loop: + mova m16, [t1+r10*2+416*2] + mova m17, [t1+r10*2+416*4] + paddd m16, m16 + paddd m17, m17 + paddd m16, [t2+r10*2+416*2] ; hv sumsq + paddd m17, [t2+r10*2+416*4] + pmulld m16, m8 ; -a * 9 + pmulld m17, m8 + mova m1, [t1+r10*2+416*0] + paddw m1, m1 + 
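+; (only two distinct source rows are available when .v runs, i.e. at the
+; top/bottom edges and for odd heights, so the t1 row is counted twice to
+; stand in for the missing third row of the vertical box sum, like the
+; .top_fixup doubling in the 5x5 function above)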
paddw m1, [t2+r10*2+416*0] ; hv sum + punpcklwd m0, m4, m1 ; b + vpdpwssd m16, m0, m0 ; -p + punpckhwd m1, m4, m1 + vpdpwssd m17, m1, m1 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + pmulld m16, m11 ; p * s + pmulld m17, m11 + vpalignr m17{k2}, m16, m16, 2 + mova m16, m20 + paddusw m17, m12 + psraw m17, 4 ; min(z, 255) - 256 + vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x + pandn m16, m13, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + paddd m0, m14 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m14 + vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12) + vpternlogd m17, m1, m13, 0xd8 + mova [t3+r10*4+ 8], m16 + mova [t3+r10*4+ 24], xm17 + vextracti32x4 [t3+r10*4+ 56], m17, 2 + mova [t3+r10*4+ 72], m17 + vextracti128 [t3+r10*4+ 72], ym16, 1 + vextracti32x4 [t3+r10*4+104], m16, 3 + add r10, 32 + jl .v_loop + ret +.prep_n: ; initial neighbor setup + mov r10, wq + mov t4, t3 + add t3, 416*4 +.prep_n_loop: + mova m2, [t5+r10*4+0] + mova m3, [t4+r10*4+0] + paddd m2, [t5+r10*4+8] + paddd m3, [t4+r10*4+8] + paddd m0, m2, [t5+r10*4+4] + paddd m1, m3, [t4+r10*4+4] + pslld m0, 2 + paddd m1, m1 ; ab[ 0] 222 + psubd m0, m2 ; ab[-1] 343 + mova [t3+r10*4+416*4], m1 + paddd m1, m1 + mova [t5+r10*4], m0 + psubd m1, m3 ; ab[ 0] 343 + mova [t4+r10*4], m1 + add r10, 16 + jl .prep_n_loop + ret +; a+b are packed together in a single dword, but we can't do the +; full neighbor calculations before splitting them since we don't +; have sufficient precision. The solution is to do the calculations +; in two equal halves and split a and b before doing the final sum. +ALIGN function_align +.n: ; neighbor + output + mov r10, wq +.n_loop: + mova m16, [t3+r10*4+ 0] + paddd m16, [t3+r10*4+ 8] + paddd m17, m16, [t3+r10*4+ 4] + paddd m17, m17 ; ab[+1] 222 + mova m2, [t3+r10*4+416*4+ 0] + paddd m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343 + mova m3, [t3+r10*4+416*4+64] + paddd m1, m3, [t5+r10*4+64] + mova [t3+r10*4+416*4+ 0], m17 + paddd m17, m17 + psubd m17, m16 ; ab[+1] 343 + mova [t5+r10*4+ 0], m17 + paddd m2, m17 ; ab[ 0] 222 + ab[+1] 343 + mova m16, [t3+r10*4+64] + paddd m16, [t3+r10*4+72] + paddd m17, m16, [t3+r10*4+68] + paddd m17, m17 + mova [t3+r10*4+416*4+64], m17 + paddd m17, m17 + psubd m17, m16 + mova [t5+r10*4+64], m17 + pandn m16, m13, m0 + psrld m0, 12 + paddd m3, m17 + pandn m17, m13, m2 + psrld m2, 12 + paddd m16, m17 ; a + pandn m17, m13, m1 + psrld m1, 12 + paddd m0, m2 ; b + (1 << 8) + pandn m2, m13, m3 + psrld m3, 12 + paddd m17, m2 + pmovzxbd m2, [dstq+r10+ 0] + paddd m1, m3 + pmovzxbd m3, [dstq+r10+16] + pmaddwd m16, m2 ; a * src + pmaddwd m17, m3 + packssdw m2, m3 + psubd m0, m16 ; b - a * src + (1 << 8) + psubd m1, m17 + psrad m0, 9 + psrad m1, 9 + packssdw m0, m1 + pmulhrsw m0, m15 + paddw m0, m2 + packuswb m0, m0 + vpermd m16, m9, m0 + mova [dstq+r10], ym16 + add r10, 32 + jl .n_loop + mov r10, t5 + mov t5, t4 + mov t4, r10 + add dstq, strideq + ret + +cglobal sgr_filter_mix_8bpc, 4, 13, 28, 416*56+8, dst, stride, left, lpf, \ + w, h, edge, params + mov paramsq, r6mp + mov wd, wm + movifnidn hd, hm + mov edged, r7m + vbroadcasti128 m5, [sgr_shuf+1] + add lpfq, wq + vbroadcasti128 m6, [sgr_shuf+9] + add dstq, wq + vbroadcasti128 m7, [sgr_shuf+3] + lea t3, [rsp+wq*4+416*24+8] + vbroadcasti128 m8, [sgr_shuf+7] + pxor m4, m4 + vpbroadcastd m9, [pd_m9] + vpsubd m11, m4, [paramsq+0] {1to16} ; -s0 + vpbroadcastd m14, [pw_61448] + vpsubd m12, m4, [paramsq+4] {1to16} ; -s1 + vpbroadcastd m26, 
[paramsq+8] ; w0 w1 + lea t1, [rsp+wq*2+12] + vpbroadcastd m10, [pd_m25] + neg wq + vpbroadcastd m13, [pw_164_455] + mov r10d, 0xfe + vpbroadcastd m15, [pd_34816] + kmovb k1, r10d + mova m20, [sgr_x_by_x+64*0] + mov r10, 0x3333333333333333 + mova m21, [sgr_x_by_x+64*1] + kmovq k2, r10 + mova m22, [sgr_x_by_x+64*2] + lea r12, [r_ext_mask+75] + mova m23, [sgr_x_by_x+64*3] + vpbroadcastd m24, [pd_m4096] + vpbroadcastd m25, [sgr_shuf+28] ; 0x8000____ + psllw m26, 5 + mova xm27, [sgr_mix_perm] + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t2, t1 + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx512icl).top_fixup + add t1, 416*12 + call .h_top + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, strideq + mov [rsp], r10 ; below + call .hv0 +.main: + dec hd + jz .height1 + add lpfq, strideq + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + add lpfq, strideq + call .hv0 + test hd, hd + jz .odd_height + add lpfq, strideq + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, [rsp] + call .hv0_bottom + add lpfq, strideq + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov [rsp], r10 + call .h + lea t2, [t1+416*12] + lea r10, [wq-2] +.top_fixup_loop: + mova m0, [t1+r10*2+416* 0] + mova m1, [t1+r10*2+416* 2] + mova m2, [t1+r10*2+416* 4] + paddw m0, m0 + mova m3, [t1+r10*2+416* 6] + paddd m1, m1 + mova m16, [t1+r10*2+416* 8] + paddd m2, m2 + mova m17, [t1+r10*2+416*10] + mova [t2+r10*2+416* 0], m0 + mova [t2+r10*2+416* 2], m1 + mova [t2+r10*2+416* 4], m2 + mova [t2+r10*2+416* 6], m3 + mova [t2+r10*2+416* 8], m16 + mova [t2+r10*2+416*10], m17 + add r10, 32 + jl .top_fixup_loop + call .v0 + jmp .main +.h: ; horizontal boxsums + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movd xm17, [leftq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + add leftq, 4 + jmp .h_main +.h_extend_left: + vpbroadcastb xm17, [lpfq+wq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + jmp .h_main +.h_top: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu ym17, [lpfq+r10-2] +.h_main: + vinserti32x8 m17, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp r10d, -34 + jl .h_have_right + vpbroadcastb m0, [lpfq-1] + movu ym16, [r12+r10-8] + vinserti32x8 m16, [r12+r10+0], 1 + vpternlogd m17, m0, m16, 0xe4 +.h_have_right: + pshufb m3, m17, m5 + pshufb m18, m17, m6 + shufps m0, m3, m18, q2121 + pmullw m2, m0, m0 + pshufb m19, m17, m7 + paddw m0, m19 + pshufb m17, m8 + paddw m0, m17 ; sum3 + punpcklwd m16, m19, m17 + punpcklwd m1, m2, m4 + vpdpwssd m1, m16, m16 ; sumsq3 + punpckhwd m19, m17 + punpckhwd m2, m4 + vpdpwssd m2, m19, m19 + mova [t1+r10*2+416* 6], m0 + mova [t1+r10*2+416* 8], m1 + mova [t1+r10*2+416*10], m2 + punpcklwd m19, m3, m18 + paddw m0, m3 + vpdpwssd m1, m19, m19 ; sumsq5 + punpckhwd m3, m18 + paddw m0, m18 ; sum5 + vpdpwssd m2, m3, m3 + mova [t1+r10*2+416* 0], m0 + mova [t1+r10*2+416* 2], m1 + mova [t1+r10*2+416* 4], m2 + add r10, 32 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows) + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movd 
xm17, [leftq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + add leftq, 4 + jmp .hv0_main +.hv0_extend_left: + vpbroadcastb xm17, [lpfq+wq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + jmp .hv0_main +.hv0_bottom: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left +.hv0_loop: + movu ym17, [lpfq+r10-2] +.hv0_main: + vinserti32x8 m17, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp r10d, -34 + jl .hv0_have_right + vpbroadcastb m0, [lpfq-1] + movu ym16, [r12+r10-8] + vinserti32x8 m16, [r12+r10+0], 1 + vpternlogd m17, m0, m16, 0xe4 +.hv0_have_right: + pshufb m18, m17, m5 + pshufb m19, m17, m6 + shufps m1, m18, m19, q2121 + pmullw m3, m1, m1 + pshufb m0, m17, m7 + paddw m1, m0 + pshufb m17, m8 + paddw m1, m17 ; sum3 + punpcklwd m16, m0, m17 + punpcklwd m2, m3, m4 + vpdpwssd m2, m16, m16 ; sumsq3 + punpckhwd m0, m17 + punpckhwd m3, m4 + vpdpwssd m3, m0, m0 + paddw m0, m1, [t1+r10*2+416* 6] + paddd m16, m2, [t1+r10*2+416* 8] + paddd m17, m3, [t1+r10*2+416*10] + mova [t1+r10*2+416* 6], m1 + mova [t1+r10*2+416* 8], m2 + mova [t1+r10*2+416*10], m3 + paddw m1, m18 + paddw m1, m19 ; sum5 + mova [t3+r10*4+416*8+ 8], m1 + paddw m1, [t1+r10*2+416* 0] + mova [t1+r10*2+416* 0], m1 + punpcklwd m1, m18, m19 + vpdpwssd m2, m1, m1 ; sumsq5 + punpckhwd m18, m19 + vpdpwssd m3, m18, m18 + mova [t3+r10*4+416*0+ 8], m2 ; we need a clean copy of the last row + mova [t3+r10*4+416*0+72], m3 ; in case height is odd + paddd m2, [t1+r10*2+416* 2] + paddd m3, [t1+r10*2+416* 4] + mova [t1+r10*2+416* 2], m2 + mova [t1+r10*2+416* 4], m3 + paddw m1, m0, [t2+r10*2+416* 6] + paddd m2, m16, [t2+r10*2+416* 8] + paddd m3, m17, [t2+r10*2+416*10] + mova [t2+r10*2+416* 6], m0 + mova [t2+r10*2+416* 8], m16 + mova [t2+r10*2+416*10], m17 + pmulld m16, m2, m9 ; -a3 * 9 + pmulld m17, m3, m9 + punpcklwd m0, m4, m1 ; b3 + vpdpwssd m16, m0, m0 ; -p3 + punpckhwd m1, m4, m1 + vpdpwssd m17, m1, m1 + pmulld m16, m12 ; p3 * s1 + pmulld m17, m12 + pmaddwd m0, m13 ; b3 * 455 + pmaddwd m1, m13 + vpalignr m17{k2}, m16, m16, 2 + mova m16, m22 + paddusw m17, m14 + psraw m17, 4 ; min(z3, 255) - 256 + vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x3 + pandn m16, m24, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m15 + vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12) + vpternlogd m17, m1, m24, 0xd8 + mova [t3+r10*4+416*4+ 8], m16 + mova [t3+r10*4+416*4+ 24], xm17 + vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2 + mova [t3+r10*4+416*4+ 72], m17 + vextracti128 [t3+r10*4+416*4+ 72], ym16, 1 + vextracti32x4 [t3+r10*4+416*4+104], m16, 3 + add r10, 32 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movd xm17, [leftq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + add leftq, 4 + jmp .hv1_main +.hv1_extend_left: + vpbroadcastb xm17, [lpfq+wq] + vmovdqu32 ym17{k1}, [lpfq+wq-4] + jmp .hv1_main +.hv1_bottom: + lea r10, [wq-2] + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left +.hv1_loop: + movu ym17, [lpfq+r10-2] +.hv1_main: + vinserti32x8 m17, [lpfq+r10+6], 1 + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp r10d, -34 + jl .hv1_have_right + vpbroadcastb m0, [lpfq-1] + movu ym16, [r12+r10-8] + vinserti32x8 m16, [r12+r10+0], 1 + vpternlogd m17, m0, m16, 0xe4 +.hv1_have_right: + pshufb m3, m17, m5 + pshufb m19, m17, m6 + shufps m2, m3, m19, q2121 + 
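+; (the 5-tap and 3-tap horizontal windows share their work: m3 and m19
+; hold the two outermost pixels and the shufps extracts the middle three,
+; so sum5 is formed further down as sum3 plus the two outer taps)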
pmullw m1, m2, m2 + pshufb m18, m17, m7 + paddw m2, m18 + pshufb m17, m8 + paddw m2, m17 ; sum3 + punpcklwd m16, m17, m18 + punpcklwd m0, m1, m4 + vpdpwssd m0, m16, m16 ; sumsq3 + punpckhwd m17, m18 + punpckhwd m1, m4 + vpdpwssd m1, m17, m17 + paddd m16, m0, [t2+r10*2+416* 8] + paddd m17, m1, [t2+r10*2+416*10] + mova [t2+r10*2+416* 8], m0 + mova [t2+r10*2+416*10], m1 + punpcklwd m18, m3, m19 + vpdpwssd m0, m18, m18 ; sumsq5 + punpckhwd m18, m3, m19 + vpdpwssd m1, m18, m18 + paddw m3, m19 + pmulld m16, m9 ; -a3 * 9 + pmulld m17, m9 + paddd m18, m0, [t2+r10*2+416*2] + paddd m19, m1, [t2+r10*2+416*4] + paddd m18, [t1+r10*2+416*2] + paddd m19, [t1+r10*2+416*4] + mova [t2+r10*2+416*2], m0 + mova [t2+r10*2+416*4], m1 + pmulld m18, m10 ; -a5 * 25 + pmulld m19, m10 + paddw m1, m2, [t2+r10*2+416* 6] + mova [t2+r10*2+416* 6], m2 + paddw m2, m3 ; sum5 + paddw m3, m2, [t2+r10*2+416*0] + paddw m3, [t1+r10*2+416*0] + mova [t2+r10*2+416*0], m2 + punpcklwd m0, m4, m1 ; b3 + vpdpwssd m16, m0, m0 ; -p3 + punpckhwd m1, m4, m1 + vpdpwssd m17, m1, m1 + punpcklwd m2, m3, m4 ; b5 + vpdpwssd m18, m2, m2 ; -p5 + punpckhwd m3, m4 + vpdpwssd m19, m3, m3 + pmulld m16, m12 ; p3 * s1 + pmulld m17, m12 + pmulld m18, m11 ; p5 * s0 + pmulld m19, m11 + pmaddwd m0, m13 ; b3 * 455 + pmaddwd m1, m13 + pmaddwd m2, m13 ; b5 * 164 + pmaddwd m3, m13 + vpalignr m17{k2}, m16, m16, 2 + vpalignr m19{k2}, m18, m18, 2 + paddusw m17, m14 + mova m16, m22 + psraw m17, 4 ; min(z3, 255) - 256 + vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127] + paddusw m19, m14 + mova m18, m22 + psraw m19, 4 ; min(z5, 255) - 256 + vpermt2b m18, m19, m23 ; sgr_x_by_x[128..255] + vpmovb2m k4, m19 + vpermi2b m19, m20, m21 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x3 + vmovdqu8 m19{k4}, m18 ; x5 + pandn m16, m24, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + pandn m18, m24, m19 + psrld m19, 16 + pmulld m2, m18 + pmulld m3, m19 + paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m15 + vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12) + vpternlogd m17, m1, m24, 0xd8 + mova [t3+r10*4+416*8+ 8], m16 + mova [t3+r10*4+416*8+ 24], xm17 + vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2 + paddd m2, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m3, m15 + mova [t3+r10*4+416*8+ 72], m17 + vextracti128 [t3+r10*4+416*8+ 72], ym16, 1 + vextracti32x4 [t3+r10*4+416*8+104], m16, 3 + vpternlogd m18, m2, m24, 0xd8 ; a5 | (b5 << 12) + vpternlogd m19, m3, m24, 0xd8 + mova [t3+r10*4+416*0+ 8], m18 + mova [t3+r10*4+416*0+ 24], xm19 + vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2 + mova [t3+r10*4+416*0+ 72], m19 + vextracti128 [t3+r10*4+416*0+ 72], ym18, 1 + vextracti32x4 [t3+r10*4+416*0+104], m18, 3 + add r10, 32 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab3 (even rows) + lea r10, [wq-2] +.v0_loop: + mova m2, [t1+r10*2+416* 8] + mova m3, [t1+r10*2+416*10] + paddd m2, m2 + paddd m3, m3 + paddd m16, m2, [t2+r10*2+416* 8] + paddd m17, m3, [t2+r10*2+416*10] + mova m0, [t1+r10*2+416* 6] + paddw m0, m0 + paddw m1, m0, [t2+r10*2+416* 6] + pmulld m16, m9 ; -a3 * 9 + pmulld m17, m9 + mova [t2+r10*2+416* 6], m0 + mova [t2+r10*2+416* 8], m2 + mova [t2+r10*2+416*10], m3 + mova m2, [t1+r10*2+416*0] + mova m3, [t1+r10*2+416*2] + mova m18, [t1+r10*2+416*4] + punpcklwd m0, m4, m1 ; b3 + vpdpwssd m16, m0, m0 ; -p3 + punpckhwd m1, m4, m1 + vpdpwssd m17, m1, m1 + pmulld m16, m12 ; p3 * s1 + pmulld m17, m12 + pmaddwd m0, m13 ; b3 * 455 + pmaddwd m1, m13 + mova 
[t3+r10*4+416*8+ 8], m2 + mova [t3+r10*4+416*0+ 8], m3 + mova [t3+r10*4+416*0+72], m18 + vpalignr m17{k2}, m16, m16, 2 + mova m16, m22 + paddusw m17, m14 + psraw m17, 4 ; min(z3, 255) - 256 + vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x3 + pandn m16, m24, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + paddw m2, m2 ; cc5 + paddd m3, m3 + paddd m18, m18 + mova [t1+r10*2+416*0], m2 + mova [t1+r10*2+416*2], m3 + mova [t1+r10*2+416*4], m18 + paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m15 + vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12) + vpternlogd m17, m1, m24, 0xd8 + mova [t3+r10*4+416*4+ 8], m16 + mova [t3+r10*4+416*4+ 24], xm17 + vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2 + mova [t3+r10*4+416*4+ 72], m17 + vextracti128 [t3+r10*4+416*4+ 72], ym16, 1 + vextracti32x4 [t3+r10*4+416*4+104], m16, 3 + add r10, 32 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) + lea r10, [wq-2] +.v1_loop: + mova m0, [t1+r10*2+416* 8] + paddd m16, m0, [t2+r10*2+416* 8] + mova m1, [t1+r10*2+416*10] + paddd m17, m1, [t2+r10*2+416*10] + mova m2, [t3+r10*4+416*0+ 8] + paddd m18, m2, [t2+r10*2+416* 2] + mova m3, [t3+r10*4+416*0+72] + paddd m19, m3, [t2+r10*2+416* 4] + paddd m18, [t1+r10*2+416* 2] + paddd m19, [t1+r10*2+416* 4] + mova [t2+r10*2+416* 8], m0 + mova [t2+r10*2+416*10], m1 + mova [t2+r10*2+416* 2], m2 + mova [t2+r10*2+416* 4], m3 + pmulld m16, m9 ; -a3 * 9 + pmulld m17, m9 + pmulld m18, m10 ; -a5 * 25 + pmulld m19, m10 + mova m0, [t1+r10*2+416* 6] + paddw m1, m0, [t2+r10*2+416* 6] + mova m2, [t3+r10*4+416*8+ 8] + paddw m3, m2, [t2+r10*2+416*0] + paddw m3, [t1+r10*2+416*0] + mova [t2+r10*2+416* 6], m0 + mova [t2+r10*2+416*0], m2 + punpcklwd m0, m4, m1 ; b3 + vpdpwssd m16, m0, m0 ; -p3 + punpckhwd m1, m4, m1 + vpdpwssd m17, m1, m1 + punpcklwd m2, m3, m4 ; b5 + vpdpwssd m18, m2, m2 ; -p5 + punpckhwd m3, m4 + vpdpwssd m19, m3, m3 + pmulld m16, m12 ; p3 * s1 + pmulld m17, m12 + pmulld m18, m11 ; p5 * s0 + pmulld m19, m11 + pmaddwd m0, m13 ; b3 * 455 + pmaddwd m1, m13 + pmaddwd m2, m13 ; b5 * 164 + pmaddwd m3, m13 + vpalignr m17{k2}, m16, m16, 2 + vpalignr m19{k2}, m18, m18, 2 + paddusw m17, m14 + mova m16, m22 + psraw m17, 4 ; min(z3, 255) - 256 + vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255] + vpmovb2m k3, m17 + vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127] + paddusw m19, m14 + mova m18, m22 + psraw m19, 4 ; min(z5, 255) - 256 + vpermt2b m18, m19, m23 ; sgr_x_by_x[128..255] + vpmovb2m k4, m19 + vpermi2b m19, m20, m21 ; sgr_x_by_x[ 0..127] + vmovdqu8 m17{k3}, m16 ; x3 + vmovdqu8 m19{k4}, m18 ; x5 + pandn m16, m24, m17 + psrld m17, 16 + pmulld m0, m16 + pmulld m1, m17 + pandn m18, m24, m19 + psrld m19, m19, 16 + pmulld m2, m18 + pmulld m3, m19 + paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m15 + vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12) + vpternlogd m17, m1, m24, 0xd8 + mova [t3+r10*4+416*8+ 8], m16 + mova [t3+r10*4+416*8+ 24], xm17 + vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2 + paddd m2, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m3, m15 + mova [t3+r10*4+416*8+ 72], m17 + vextracti128 [t3+r10*4+416*8+ 72], ym16, 1 + vextracti32x4 [t3+r10*4+416*8+104], m16, 3 + vpternlogd m18, m2, m24, 0xd8 ; a5 | (b5 << 12) + vpternlogd m19, m3, m24, 0xd8 + mova [t3+r10*4+416*0+ 8], m18 + mova [t3+r10*4+416*0+ 24], xm19 + vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2 + mova [t3+r10*4+416*0+ 72], m19 + vextracti128 [t3+r10*4+416*0+ 72], ym18, 1 + vextracti32x4 
[t3+r10*4+416*0+104], m18, 3 + add r10, 32 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + mov r10, wq +.prep_n_loop: + movu m0, [t3+r10*4+416*0+4] + paddd m1, m0, [t3+r10*4+416*0+0] + mova m16, [t3+r10*4+416*4+0] + paddd m1, [t3+r10*4+416*0+8] + mova m17, [t3+r10*4+416*8+0] + paddd m16, [t3+r10*4+416*4+8] + paddd m17, [t3+r10*4+416*8+8] + paddd m2, m16, [t3+r10*4+416*4+4] + paddd m3, m17, [t3+r10*4+416*8+4] + paddd m0, m1 + pslld m1, 2 + pslld m2, 2 + paddd m1, m0 ; ab5 565 + paddd m3, m3 ; ab3[ 0] 222 + psubd m2, m16 ; ab3[-1] 343 + mova [t3+r10*4+416*20], m3 + pandn m0, m24, m1 ; a5 565 + mova [t3+r10*4+416*24], m2 + psrld m1, 12 ; b5 565 + mova [t3+r10*4+416*12], m0 + paddd m3, m3 + mova [t3+r10*4+416*16], m1 + psubd m3, m17 ; ab3[ 0] 343 + mova [t3+r10*4+416*28], m3 + add r10, 16 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + mov r10, wq +.n0_loop: + movu m2, [t3+r10*4+4] + paddd m3, m2, [t3+r10*4+0] + paddd m3, [t3+r10*4+8] + mova m1, [t3+r10*4+416*4+0] + paddd m2, m3 + pslld m3, 2 + paddd m1, [t3+r10*4+416*4+8] + paddd m3, m2 + pandn m2, m24, m3 + psrld m3, 12 + paddd m0, m2, [t3+r10*4+416*12] ; a5 + paddd m16, m3, [t3+r10*4+416*16] ; b5 + (1 << 8) + mova [t3+r10*4+416*12], m2 + mova [t3+r10*4+416*16], m3 + paddd m2, m1, [t3+r10*4+416*4+4] + paddd m2, m2 ; ab3[ 1] 222 + mova m3, [t3+r10*4+416*20] + paddd m17, m3, [t3+r10*4+416*24] ; ab3[ 0] 222 + ab3[-1] 343 + mova [t3+r10*4+416*20], m2 + paddd m2, m2 + psubd m2, m1 ; ab3[ 1] 343 + mova [t3+r10*4+416*24], m2 + paddd m2, m3 ; ab3[ 0] 222 + ab3[ 1] 343 + pandn m1, m24, m17 + psrld m17, 12 + pandn m3, m24, m2 + psrld m2, 12 + paddd m1, m3 ; a3 + pmovzxbd m3, [dstq+r10] + paddd m17, m2 ; b3 + (1 << 8) + pmaddwd m0, m3 ; a5 * src + pmaddwd m1, m3 ; a3 * src + vpshldd m3, m25, 16 ; (dst << 16) + (1 << 15) + psubd m16, m0 ; b5 - a5 * src + (1 << 8) + psubd m17, m1 ; b3 - a3 * src + (1 << 8) + psrld m16, 9 + pslld m17, 7 + vmovdqu8 m17{k2}, m16 + vpdpwssd m3, m17, m26 + packuswb m3, m2 + vpermb m16, m27, m3 + mova [dstq+r10], xm16 + add r10, 16 + jl .n0_loop + add dstq, strideq + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + mov r10, wq +.n1_loop: + mova m1, [t3+r10*4+416*8+0] + paddd m1, [t3+r10*4+416*8+8] + paddd m2, m1, [t3+r10*4+416*8+4] + paddd m2, m2 ; ab3[ 1] 222 + mova m0, [t3+r10*4+416*20] + paddd m17, m0, [t3+r10*4+416*28] ; ab3[ 0] 222 + ab3[-1] 343 + pmovzxbd m3, [dstq+r10] + mova [t3+r10*4+416*20], m2 + paddd m2, m2 + psubd m2, m1 ; ab3[ 1] 343 + mova [t3+r10*4+416*28], m2 + paddd m0, m2 ; ab3[ 0] 222 + ab3[ 1] 343 + pandn m1, m24, m17 + psrld m17, 12 + pandn m2, m24, m0 + psrld m0, 12 + paddd m1, m2 ; a3 + paddd m17, m0 ; b3 + (1 << 8) + mova m16, [t3+r10*4+416*16] ; b5 + (1 << 7) + pmaddwd m1, m3 ; a3 * src + pmaddwd m0, m3, [t3+r10*4+416*12] ; a5 * src + vpshldd m3, m25, 16 ; (dst << 16) + (1 << 15) + psubd m17, m1 ; b3 - a3 * src + (1 << 8) + psubd m16, m0 ; b5 - a5 * src + (1 << 7) + pslld m17, 7 + palignr m17{k2}, m16, m16, 1 + vpdpwssd m3, m17, m26 + packuswb m3, m3 + vpermb m16, m27, m3 + mova [dstq+r10], xm16 + add r10, 16 + jl .n1_loop + add dstq, strideq + ret + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/looprestoration_sse.asm b/third_party/dav1d/src/x86/looprestoration_sse.asm new file mode 100644 index 0000000000..01eb6fa348 --- /dev/null +++ b/third_party/dav1d/src/x86/looprestoration_sse.asm @@ -0,0 +1,3681 @@ +; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018, Two 
Orioles, LLC +; Copyright © 2018, VideoLabs +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA 16 + +wiener_init: db 6, 7, 6, 7, 6, 7, 6, 7, 0, 0, 0, 0, 2, 4, 2, 4 +wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14 +wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12 +wiener_shufD: db 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1, 10, -1, 11, -1 +wiener_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 +sgr_lshuf3: db 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 +sgr_lshuf5: db 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 +pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + +pb_right_ext_mask: times 24 db 0xff + times 8 db 0 +pb_1: times 16 db 1 +pb_3: times 16 db 3 +pw_256: times 8 dw 256 +pw_2056: times 8 dw 2056 +pw_m16380: times 8 dw -16380 +pd_4096: times 4 dd 4096 +pd_34816: times 4 dd 34816 +pd_0xffff: times 4 dd 0xffff +pd_0xf00800a4: times 4 dd 0xf00800a4 +pd_0xf00801c7: times 4 dd 0xf00801c7 + +cextern sgr_x_by_x + +SECTION .text + +%macro movif64 2 ; dst, src + %if ARCH_X86_64 + mov %1, %2 + %endif +%endmacro + +%macro movif32 2 ; dst, src + %if ARCH_X86_32 + mov %1, %2 + %endif +%endmacro + +%if ARCH_X86_32 + %define PIC_base_offset $$ + + %macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg + %assign pic_reg_stk_off 4 + %xdefine PIC_reg %1 + %if %2 == 1 + mov [esp], %1 + %endif + LEA PIC_reg, PIC_base_offset + %if %3 == 1 + XCHG_PIC_REG + %endif + %endmacro + + %macro XCHG_PIC_REG 0 + mov [esp+pic_reg_stk_off], PIC_reg + %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8 + mov PIC_reg, [esp+pic_reg_stk_off] + %endmacro + + %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset) + +%else + %macro XCHG_PIC_REG 0 + %endmacro + + %define PIC_sym(sym) (sym) +%endif + +%macro WIENER 0 +%if ARCH_X86_64 +DECLARE_REG_TMP 9, 7, 10, 11, 12, 13, 14 ; ring buffer pointers +cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \ + w, h, edge, flt, x + %define tmpstrideq strideq + %define base 0 + mov fltq, r6mp + mov wd, wm + movifnidn hd, hm + mov edged, r7m + movq m14, [fltq] + add lpfq, wq + movq m7, [fltq+16] 
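+; (the Wiener filters are 7-tap and symmetric, so flt stores only the four
+; unique int16 coefficients per direction: [fltq+0] holds x0-x3 for the
+; horizontal pass and [fltq+16] holds y0-y3 for the vertical pass; the
+; mirrored taps come from the pshufd/pshufb expansions below)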
+ add dstq, wq + lea t1, [rsp+wq*2+16] + mova m15, [pw_2056] + neg wq +%if cpuflag(ssse3) + pshufb m14, [wiener_init] + mova m8, [wiener_shufA] + pshufd m12, m14, q2222 ; x0 x0 + mova m9, [wiener_shufB] + pshufd m13, m14, q3333 ; x1 x2 + mova m10, [wiener_shufC] + punpcklqdq m14, m14 ; x3 + mova m11, [wiener_shufD] +%else + mova m10, [pw_m16380] + punpcklwd m14, m14 + pshufd m11, m14, q0000 ; x0 + pshufd m12, m14, q1111 ; x1 + pshufd m13, m14, q2222 ; x2 + pshufd m14, m14, q3333 ; x3 +%endif +%else +DECLARE_REG_TMP 4, 0, _, 5 +%if cpuflag(ssse3) + %define m10 [base+wiener_shufC] + %define m11 [base+wiener_shufD] + %define stk_off 96 +%else + %define m10 [base+pw_m16380] + %define m11 [stk+96] + %define stk_off 112 +%endif +cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, tmpstride + %define base r6-pb_right_ext_mask-21 + %define stk esp + %define dstq leftq + %define edgeb byte edged + %define edged [stk+ 8] + %define dstmp [stk+12] + %define hd dword [stk+16] + %define wq [stk+20] + %define strideq [stk+24] + %define leftmp [stk+28] + %define t2 [stk+32] + %define t4 [stk+36] + %define t5 [stk+40] + %define t6 [stk+44] + %define m8 [base+wiener_shufA] + %define m9 [base+wiener_shufB] + %define m12 [stk+48] + %define m13 [stk+64] + %define m14 [stk+80] + %define m15 [base+pw_2056] + mov r1, r6m ; flt + mov r0, r0m ; dst + mov r4, r4m ; w + mov lpfq, lpfm + mov r2, r7m ; edge + mov r5, r5m ; h + movq m3, [r1+ 0] + movq m7, [r1+16] + add r0, r4 + mov r1, r1m ; stride + add lpfq, r4 + mov edged, r2 + mov r2, r2m ; left + mov dstmp, r0 + lea t1, [rsp+r4*2+stk_off] + mov hd, r5 + neg r4 + LEA r6, pb_right_ext_mask+21 + mov wq, r4 + mov strideq, r1 + mov leftmp, r2 + mov r4, r1 +%if cpuflag(ssse3) + pshufb m3, [base+wiener_init] + pshufd m1, m3, q2222 + pshufd m2, m3, q3333 + punpcklqdq m3, m3 +%else + punpcklwd m3, m3 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova m11, m0 +%endif + mova m12, m1 + mova m13, m2 + mova m14, m3 +%endif + psllw m7, 5 + pshufd m6, m7, q0000 ; y0 y1 + pshufd m7, m7, q1111 ; y2 y3 + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t6, t1 + mov t5, t1 + add t1, 384*2 + call .h_top + lea t3, [lpfq+tmpstrideq*4] + mov lpfq, dstmp + add t3, tmpstrideq + mov [rsp], t3 ; below + mov t4, t1 + add t1, 384*2 + call .h + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v3 +.main: + lea t0, [t1+384*2] +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v3 + mov lpfq, [rsp] + call .hv_bottom + add lpfq, strideq + call .hv_bottom +.v1: + call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v + RET +.no_top: + lea t3, [lpfq+tmpstrideq*4] + mov lpfq, dstmp + lea t3, [t3+tmpstrideq*2] + mov [rsp], t3 + call .h + mov t6, t1 + mov t5, t1 + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + mov t2, t1 + dec hd + jz .v2 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v3 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v3 + add t0, 384*8 + call .hv + dec hd + jnz .main +.v3: + call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v +.v2: + call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v + jmp .v1 +.extend_right: + movd m2, [lpfq-4] +%if ARCH_X86_64 + push r0 + lea r0, [pb_right_ext_mask+21] + movu m0, [r0+xq+0] + movu m1, [r0+xq+8] + pop 
r0 +%else + movu m0, [r6+xq+0] + movu m1, [r6+xq+8] +%endif +%if cpuflag(ssse3) + pshufb m2, [base+pb_3] +%else + punpcklbw m2, m2 + pshuflw m2, m2, q3333 + punpcklqdq m2, m2 +%endif + pand m4, m0 + pand m5, m1 + pandn m0, m2 + pandn m1, m2 + por m4, m0 + por m5, m1 + ret +.h: + %define stk esp+4 ; offset due to call + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movifnidn leftq, leftmp + mova m4, [lpfq+xq] + movd m5, [leftq] + add leftq, 4 + pslldq m4, 4 + por m4, m5 + movifnidn leftmp, leftq + jmp .h_main +.h_extend_left: +%if cpuflag(ssse3) + mova m4, [lpfq+xq] + pshufb m4, [base+wiener_l_shuf] +%else + mova m5, [lpfq+xq] + pshufd m4, m5, q2103 + punpcklbw m5, m5 + punpcklwd m5, m5 + movss m4, m5 +%endif + jmp .h_main +.h_top: + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m4, [lpfq+xq-4] +.h_main: + movu m5, [lpfq+xq+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp xd, -18 + jl .h_have_right + call .extend_right +.h_have_right: +%macro %%h7 0 +%if cpuflag(ssse3) + pshufb m0, m4, m8 + pmaddubsw m0, m12 + pshufb m1, m5, m8 + pmaddubsw m1, m12 + pshufb m2, m4, m9 + pmaddubsw m2, m13 + pshufb m3, m5, m9 + pmaddubsw m3, m13 + paddw m0, m2 + pshufb m2, m4, m10 + pmaddubsw m2, m13 + paddw m1, m3 + pshufb m3, m5, m10 + pmaddubsw m3, m13 + pshufb m4, m11 + paddw m0, m2 + pmullw m2, m14, m4 + pshufb m5, m11 + paddw m1, m3 + pmullw m3, m14, m5 + psllw m4, 7 + psllw m5, 7 + paddw m0, m2 + mova m2, [base+pw_m16380] + paddw m1, m3 + paddw m4, m2 + paddw m5, m2 + paddsw m0, m4 + paddsw m1, m5 +%else + psrldq m0, m4, 1 + pslldq m1, m4, 1 + pxor m3, m3 + punpcklbw m0, m3 + punpckhbw m1, m3 + paddw m0, m1 + pmullw m0, m11 + psrldq m1, m4, 2 + pslldq m2, m4, 2 + punpcklbw m1, m3 + punpckhbw m2, m3 + paddw m1, m2 + pmullw m1, m12 + paddw m0, m1 + pshufd m2, m4, q0321 + punpcklbw m2, m3 + pmullw m1, m14, m2 + paddw m0, m1 + psrldq m1, m4, 3 + pslldq m4, 3 + punpcklbw m1, m3 + punpckhbw m4, m3 + paddw m1, m4 + pmullw m1, m13 + paddw m0, m1 + psllw m2, 7 + paddw m2, m10 + paddsw m0, m2 + psrldq m1, m5, 1 + pslldq m2, m5, 1 + punpcklbw m1, m3 + punpckhbw m2, m3 + paddw m1, m2 + pmullw m1, m11 + psrldq m2, m5, 2 + pslldq m4, m5, 2 + punpcklbw m2, m3 + punpckhbw m4, m3 + paddw m2, m4 + pmullw m2, m12 + paddw m1, m2 + pshufd m4, m5, q0321 + punpcklbw m4, m3 + pmullw m2, m14, m4 + paddw m1, m2 + psrldq m2, m5, 3 + pslldq m5, 3 + punpcklbw m2, m3 + punpckhbw m5, m3 + paddw m2, m5 + pmullw m2, m13 + paddw m1, m2 + psllw m4, 7 + paddw m4, m10 + paddsw m1, m4 +%endif +%endmacro + %%h7 + psraw m0, 3 + psraw m1, 3 + paddw m0, m15 + paddw m1, m15 + mova [t1+xq*2+ 0], m0 + mova [t1+xq*2+16], m1 + add xq, 16 + jl .h_loop + ret +ALIGN function_align +.hv: + add lpfq, strideq + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movifnidn leftq, leftmp + mova m4, [lpfq+xq] + movd m5, [leftq] + add leftq, 4 + pslldq m4, 4 + por m4, m5 + movifnidn leftmp, leftq + jmp .hv_main +.hv_extend_left: +%if cpuflag(ssse3) + mova m4, [lpfq+xq] + pshufb m4, [base+wiener_l_shuf] +%else + mova m5, [lpfq+xq] + pshufd m4, m5, q2103 + punpcklbw m5, m5 + punpcklwd m5, m5 + movss m4, m5 +%endif + jmp .hv_main +.hv_bottom: + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m4, [lpfq+xq-4] +.hv_main: + movu m5, [lpfq+xq+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp xd, -18 + jl .hv_have_right + call .extend_right +.hv_have_right: + %%h7 +%if ARCH_X86_64 + mova m2, [t4+xq*2] + paddw m2, [t2+xq*2] +%else + mov r2, t4 + mova 
m2, [r2+xq*2] + mov r2, t2 + paddw m2, [r2+xq*2] + mov r2, t5 +%endif + mova m3, [t3+xq*2] +%if ARCH_X86_64 + mova m5, [t5+xq*2] +%else + mova m5, [r2+xq*2] + mov r2, t6 +%endif + paddw m5, [t1+xq*2] + psraw m0, 3 + psraw m1, 3 + paddw m0, m15 + paddw m1, m15 +%if ARCH_X86_64 + paddw m4, m0, [t6+xq*2] +%else + paddw m4, m0, [r2+xq*2] + mov r2, t4 +%endif + mova [t0+xq*2], m0 + punpcklwd m0, m2, m3 + pmaddwd m0, m7 + punpckhwd m2, m3 + pmaddwd m2, m7 + punpcklwd m3, m4, m5 + pmaddwd m3, m6 + punpckhwd m4, m5 + pmaddwd m4, m6 + paddd m0, m3 + mova m3, [t3+xq*2+16] + paddd m4, m2 +%if ARCH_X86_64 + mova m2, [t4+xq*2+16] + paddw m2, [t2+xq*2+16] + mova m5, [t5+xq*2+16] +%else + mova m2, [r2+xq*2+16] + mov r2, t2 + paddw m2, [r2+xq*2+16] + mov r2, t5 + mova m5, [r2+xq*2+16] + mov r2, t6 +%endif + paddw m5, [t1+xq*2+16] + packuswb m0, m4 +%if ARCH_X86_64 + paddw m4, m1, [t6+xq*2+16] +%else + paddw m4, m1, [r2+xq*2+16] + mov dstq, dstmp +%endif + mova [t0+xq*2+16], m1 + punpcklwd m1, m2, m3 + pmaddwd m1, m7 + punpckhwd m2, m3 + pmaddwd m2, m7 + punpcklwd m3, m4, m5 + pmaddwd m3, m6 + punpckhwd m4, m5 + pmaddwd m4, m6 + paddd m1, m3 + paddd m2, m4 + packuswb m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + mova [dstq+xq], m0 + add xq, 16 + jl .hv_loop + add dstq, strideq +%if ARCH_X86_64 + mov t6, t5 + mov t5, t4 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t6 +%else + mov dstmp, dstq + mov r1, t5 + mov r2, t4 + mov t6, r1 + mov t5, r2 + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, r1 +%endif + ret +%if cpuflag(ssse3) ; identical in sse2 and ssse3, so share code +.v: + mov xq, wq +.v_loop: +%if ARCH_X86_64 + mova m1, [t4+xq*2] + paddw m1, [t2+xq*2] +%else + mov r2, t4 + mova m1, [r2+xq*2] + mov r2, t2 + paddw m1, [r2+xq*2] + mov r2, t6 +%endif + mova m2, [t3+xq*2] + mova m4, [t1+xq*2] +%if ARCH_X86_64 + paddw m3, m4, [t6+xq*2] + paddw m4, [t5+xq*2] +%else + paddw m3, m4, [r2+xq*2] + mov r2, t5 + paddw m4, [r2+xq*2] + mov r2, t4 +%endif + punpcklwd m0, m1, m2 + pmaddwd m0, m7 + punpckhwd m1, m2 + pmaddwd m1, m7 + punpcklwd m2, m3, m4 + pmaddwd m2, m6 + punpckhwd m3, m4 + pmaddwd m3, m6 + paddd m0, m2 + paddd m1, m3 +%if ARCH_X86_64 + mova m2, [t4+xq*2+16] + paddw m2, [t2+xq*2+16] +%else + mova m2, [r2+xq*2+16] + mov r2, t2 + paddw m2, [r2+xq*2+16] + mov r2, t6 +%endif + mova m3, [t3+xq*2+16] + mova m5, [t1+xq*2+16] +%if ARCH_X86_64 + paddw m4, m5, [t6+xq*2+16] + paddw m5, [t5+xq*2+16] +%else + paddw m4, m5, [r2+xq*2+16] + mov r2, t5 + paddw m5, [r2+xq*2+16] + movifnidn dstq, dstmp +%endif + packuswb m0, m1 + punpcklwd m1, m2, m3 + pmaddwd m1, m7 + punpckhwd m2, m3 + pmaddwd m2, m7 + punpcklwd m3, m4, m5 + pmaddwd m3, m6 + punpckhwd m4, m5 + pmaddwd m4, m6 + paddd m1, m3 + paddd m2, m4 + packuswb m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + mova [dstq+xq], m0 + add xq, 16 + jl .v_loop + add dstq, strideq +%if ARCH_X86_64 + mov t6, t5 + mov t5, t4 +%else + mov dstmp, dstq + mov r1, t5 + mov r2, t4 + mov t6, r1 + mov t5, r2 +%endif + mov t4, t3 + mov t3, t2 + mov t2, t1 + ret +%endif + +%if ARCH_X86_64 +cglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \ + w, h, edge, flt, x + mov fltq, r6mp + mov wd, wm + movifnidn hd, hm + mov edged, r7m + movq m14, [fltq] + add lpfq, wq + movq m7, [fltq+16] + add dstq, wq + mova m8, [pw_m16380] + lea t1, [rsp+wq*2+16] + mova m15, [pw_2056] + neg wq +%if cpuflag(ssse3) + pshufb m14, [wiener_init] + mova m9, [wiener_shufB] + pshufd m13, m14, q3333 ; x1 x2 + mova m10, [wiener_shufC] + punpcklqdq 
m14, m14 ; x3 + mova m11, [wiener_shufD] + mova m12, [wiener_l_shuf] +%else + punpcklwd m14, m14 + pshufd m11, m14, q1111 ; x1 + pshufd m13, m14, q2222 ; x2 + pshufd m14, m14, q3333 ; x3 +%endif +%else +%if cpuflag(ssse3) + %define stk_off 80 +%else + %define m11 [stk+80] + %define stk_off 96 +%endif +cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, tmpstride + %define stk esp + %define leftmp [stk+28] + %define m8 [base+pw_m16380] + %define m12 [base+wiener_l_shuf] + %define m14 [stk+48] + mov r1, r6m ; flt + mov r0, r0m ; dst + mov r4, r4m ; w + mov lpfq, lpfm + mov r2, r7m ; edge + mov r5, r5m ; h + movq m2, [r1+ 0] + movq m7, [r1+16] + add r0, r4 + mov r1, r1m ; stride + add lpfq, r4 + mov edged, r2 + mov r2, r2m ; left + mov dstmp, r0 + lea t1, [rsp+r4*2+stk_off] + mov hd, r5 + neg r4 + LEA r6, pb_right_ext_mask+21 + mov wq, r4 + mov strideq, r1 + mov leftmp, r2 + mov r4, r1 +%if cpuflag(ssse3) + pshufb m2, [base+wiener_init] + pshufd m1, m2, q3333 + punpcklqdq m2, m2 +%else + punpcklwd m2, m2 + pshufd m0, m2, q1111 + pshufd m1, m2, q2222 + pshufd m2, m2, q3333 + mova m11, m0 +%endif + mova m13, m1 + mova m14, m2 +%endif + psllw m7, 5 + pshufd m6, m7, q0000 ; __ y1 + pshufd m7, m7, q1111 ; y2 y3 + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, strideq + mov t4, t1 + add t1, 384*2 + call .h_top + lea xq, [lpfq+tmpstrideq*4] + mov lpfq, dstmp + mov t3, t1 + add t1, 384*2 + add xq, tmpstrideq + mov [rsp], xq ; below + call .h + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v2 +.main: + mov t0, t4 +.main_loop: + call .hv + dec hd + jnz .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .v2 + mov lpfq, [rsp] + call .hv_bottom + add lpfq, strideq + call .hv_bottom +.end: + RET +.no_top: + lea t3, [lpfq+tmpstrideq*4] + mov lpfq, dstmp + lea t3, [t3+tmpstrideq*2] + mov [rsp], t3 + call .h + mov t4, t1 + mov t3, t1 + mov t2, t1 + dec hd + jz .v1 + add lpfq, strideq + add t1, 384*2 + call .h + dec hd + jz .v2 + lea t0, [t1+384*2] + call .hv + dec hd + jz .v2 + add t0, 384*6 + call .hv + dec hd + jnz .main +.v2: + call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v + add dstq, strideq + mov t4, t3 + mov t3, t2 + mov t2, t1 + movifnidn dstmp, dstq +.v1: + call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v + jmp .end +.h: + %define stk esp+4 + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movifnidn leftq, leftmp + mova m4, [lpfq+xq] + movd m5, [leftq] + add leftq, 4 + pslldq m4, 4 + por m4, m5 + movifnidn leftmp, leftq + jmp .h_main +.h_extend_left: +%if cpuflag(ssse3) + mova m4, [lpfq+xq] + pshufb m4, m12 +%else + mova m5, [lpfq+xq] + pshufd m4, m5, q2103 + punpcklbw m5, m5 + punpcklwd m5, m5 + movss m4, m5 +%endif + jmp .h_main +.h_top: + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left +.h_loop: + movu m4, [lpfq+xq-4] +.h_main: + movu m5, [lpfq+xq+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp xd, -17 + jl .h_have_right + call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right +.h_have_right: +%macro %%h5 0 +%if cpuflag(ssse3) + pshufb m0, m4, m9 + pmaddubsw m0, m13 + pshufb m1, m5, m9 + pmaddubsw m1, m13 + pshufb m2, m4, m10 + pmaddubsw m2, m13 + pshufb m3, m5, m10 + pmaddubsw m3, m13 + pshufb m4, m11 + paddw m0, m2 + pmullw m2, m14, m4 + pshufb m5, m11 + paddw m1, m3 + pmullw m3, m14, m5 + psllw m4, 7 + psllw m5, 7 + paddw m4, m8 + paddw m5, m8 + paddw m0, m2 + paddw m1, m3 + paddsw m0, m4 + paddsw m1, m5 +%else + psrldq m0, m4, 2 
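+ ; SSE2 fallback: lacking pshufb/pmaddubsw, taps are built by
+ ; byte-shifting copies of the source (psrldq/pslldq), widening to
+ ; 16 bit with punpck, and summing the two pixels that share a
+ ; coefficient before a single pmullw.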
+ pslldq m1, m4, 2 + pxor m3, m3 + punpcklbw m0, m3 + punpckhbw m1, m3 + paddw m0, m1 + pmullw m0, m11 + pshufd m2, m4, q0321 + punpcklbw m2, m3 + pmullw m1, m14, m2 + paddw m0, m1 + psrldq m1, m4, 3 + pslldq m4, 3 + punpcklbw m1, m3 + punpckhbw m4, m3 + paddw m1, m4 + pmullw m1, m13 + paddw m0, m1 + psllw m2, 7 + paddw m2, m8 + paddsw m0, m2 + psrldq m1, m5, 2 + pslldq m4, m5, 2 + punpcklbw m1, m3 + punpckhbw m4, m3 + paddw m1, m4 + pmullw m1, m11 + pshufd m4, m5, q0321 + punpcklbw m4, m3 + pmullw m2, m14, m4 + paddw m1, m2 + psrldq m2, m5, 3 + pslldq m5, 3 + punpcklbw m2, m3 + punpckhbw m5, m3 + paddw m2, m5 + pmullw m2, m13 + paddw m1, m2 + psllw m4, 7 + paddw m4, m8 + paddsw m1, m4 +%endif +%endmacro + %%h5 + psraw m0, 3 + psraw m1, 3 + paddw m0, m15 + paddw m1, m15 + mova [t1+xq*2+ 0], m0 + mova [t1+xq*2+16], m1 + add xq, 16 + jl .h_loop + ret +ALIGN function_align +.hv: + add lpfq, strideq + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left + movifnidn leftq, leftmp + mova m4, [lpfq+xq] + movd m5, [leftq] + add leftq, 4 + pslldq m4, 4 + por m4, m5 + movifnidn leftmp, leftq + jmp .hv_main +.hv_extend_left: +%if cpuflag(ssse3) + mova m4, [lpfq+xq] + pshufb m4, m12 +%else + mova m5, [lpfq+xq] + pshufd m4, m5, q2103 + punpcklbw m5, m5 + punpcklwd m5, m5 + movss m4, m5 +%endif + jmp .hv_main +.hv_bottom: + mov xq, wq + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv_extend_left +.hv_loop: + movu m4, [lpfq+xq-4] +.hv_main: + movu m5, [lpfq+xq+4] + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv_have_right + cmp xd, -17 + jl .hv_have_right + call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right +.hv_have_right: + %%h5 + mova m2, [t3+xq*2] + paddw m2, [t1+xq*2] + psraw m0, 3 + psraw m1, 3 + paddw m0, m15 + paddw m1, m15 +%if ARCH_X86_64 + mova m3, [t2+xq*2] + paddw m4, m0, [t4+xq*2] +%else + mov r2, t2 + mova m3, [r2+xq*2] + mov r2, t4 + paddw m4, m0, [r2+xq*2] +%endif + mova [t0+xq*2], m0 + punpcklwd m0, m2, m3 + pmaddwd m0, m7 + punpckhwd m2, m3 + pmaddwd m2, m7 + punpcklwd m3, m4, m4 + pmaddwd m3, m6 + punpckhwd m4, m4 + pmaddwd m4, m6 + paddd m0, m3 + paddd m4, m2 + mova m2, [t3+xq*2+16] + paddw m2, [t1+xq*2+16] + packuswb m0, m4 +%if ARCH_X86_64 + mova m3, [t2+xq*2+16] + paddw m4, m1, [t4+xq*2+16] +%else + paddw m4, m1, [r2+xq*2+16] + mov r2, t2 + mova m3, [r2+xq*2+16] + mov dstq, dstmp +%endif + mova [t0+xq*2+16], m1 + punpcklwd m1, m2, m3 + pmaddwd m1, m7 + punpckhwd m2, m3 + pmaddwd m2, m7 + punpcklwd m3, m4, m4 + pmaddwd m3, m6 + punpckhwd m4, m4 + pmaddwd m4, m6 + paddd m1, m3 + paddd m2, m4 + packuswb m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + mova [dstq+xq], m0 + add xq, 16 + jl .hv_loop + add dstq, strideq + mov t4, t3 + mov t3, t2 + mov t2, t1 + mov t1, t0 + mov t0, t4 + movifnidn dstmp, dstq + ret +%if cpuflag(ssse3) +.v: + mov xq, wq +.v_loop: + mova m3, [t1+xq*2] + paddw m1, m3, [t3+xq*2] +%if ARCH_X86_64 + mova m2, [t2+xq*2] + paddw m3, [t4+xq*2] +%else + mov r2, t2 + mova m2, [r2+xq*2] + mov r2, t4 + paddw m3, [r2+xq*2] +%endif + punpcklwd m0, m1, m2 + pmaddwd m0, m7 + punpckhwd m1, m2 + pmaddwd m1, m7 + punpcklwd m2, m3 + pmaddwd m2, m6 + punpckhwd m3, m3 + pmaddwd m3, m6 + paddd m0, m2 + paddd m1, m3 + mova m4, [t1+xq*2+16] + paddw m2, m4, [t3+xq*2+16] +%if ARCH_X86_64 + mova m3, [t2+xq*2+16] + paddw m4, [t4+xq*2+16] +%else + paddw m4, [r2+xq*2+16] + mov r2, t2 + mova m3, [r2+xq*2+16] + mov dstq, dstmp +%endif + packuswb m0, m1 + punpcklwd m1, m2, m3 + pmaddwd m1, m7 + punpckhwd m2, m3 + pmaddwd m2, m7 + punpcklwd m3, m4 + pmaddwd m3, m6 + 
punpckhwd m4, m4 + pmaddwd m4, m6 + paddd m1, m3 + paddd m2, m4 + packuswb m1, m2 + psrlw m0, 8 + psrlw m1, 8 + packuswb m0, m1 + mova [dstq+xq], m0 + add xq, 16 + jl .v_loop + ret +%endif +%endmacro + +INIT_XMM sse2 +WIENER + +INIT_XMM ssse3 +WIENER + +;;;;;;;;;;;;;;;;;;;;;;;;;; +;; self-guided ;; +;;;;;;;;;;;;;;;;;;;;;;;;;; + +%macro GATHERDD 3 ; dst, src, tmp + movd %3d, %2 + %if ARCH_X86_64 + movd %1, [r13+%3] + pextrw %3d, %2, 2 + pinsrw %1, [r13+%3+2], 3 + pextrw %3d, %2, 4 + pinsrw %1, [r13+%3+2], 5 + pextrw %3d, %2, 6 + pinsrw %1, [r13+%3+2], 7 + %else + movd %1, [base+sgr_x_by_x-0xf03+%3] + pextrw %3, %2, 2 + pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 3 + pextrw %3, %2, 4 + pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 5 + pextrw %3, %2, 6 + pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 7 + %endif +%endmacro + +%macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore + %if ARCH_X86_64 + %define tmp r14 + %else + %define tmp %4 + %endif + GATHERDD %1, %2, tmp + GATHERDD %2, %3, tmp + movif32 %4, %5 + psrld %1, 24 + psrld %2, 24 + packssdw %1, %2 +%endmacro + +%macro MULLD 3 ; dst, src, tmp + pmulhuw %3, %1, %2 + pmullw %1, %2 + pslld %3, 16 + paddd %1, %3 +%endmacro + +%if ARCH_X86_32 +DECLARE_REG_TMP 0, 1, 2, 3, 5 + %if STACK_ALIGNMENT < 16 + %assign extra_stack 5*16 + %else + %assign extra_stack 3*16 + %endif +cglobal sgr_filter_5x5_8bpc, 1, 7, 8, -400*24-16-extra_stack, \ + dst, stride, left, lpf, w + %if STACK_ALIGNMENT < 16 + %define dstm dword [esp+calloff+16*0+4*6] + %define stridemp dword [esp+calloff+16*0+4*7] + %define leftm dword [esp+calloff+16*3+4*0] + %define lpfm dword [esp+calloff+16*3+4*1] + %define w0m dword [esp+calloff+16*3+4*2] + %define hd dword [esp+calloff+16*3+4*3] + %define edgeb byte [esp+calloff+16*3+4*4] + %define edged dword [esp+calloff+16*3+4*4] + %define leftmp leftm + %else + %define w0m wm + %define hd dword r5m + %define edgeb byte r7m + %define edged dword r7m + %endif + %define hvsrcm dword [esp+calloff+4*0] + %define w1m dword [esp+calloff+4*1] + %define t0m dword [esp+calloff+4*2] + %define t2m dword [esp+calloff+4*3] + %define t3m dword [esp+calloff+4*4] + %define t4m dword [esp+calloff+4*5] + %define m8 [base+pb_1] + %define m9 [esp+calloff+16*2] + %define m10 [base+pd_0xf00800a4] + %define m11 [base+sgr_lshuf5] + %define m12 [base+pd_34816] + %define m13 [base+pb_0to15] + %define r10 r4 + %define base r6-$$ + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov strideq, [rstk+stack_offset+ 8] + mov leftq, [rstk+stack_offset+12] + mov lpfq, [rstk+stack_offset+16] + mov wd, [rstk+stack_offset+20] + mov dstm, dstq + mov stridemp, strideq + mov leftm, leftq + mov r1, [rstk+stack_offset+24] + mov r2, [rstk+stack_offset+32] + mov lpfm, lpfq + mov hd, r1 + mov edged, r2 + %endif +%else +DECLARE_REG_TMP 8, 7, 9, 11, 12 +cglobal sgr_filter_5x5_8bpc, 4, 15, 14, -400*24-16, dst, stride, left, lpf, \ + w, h, edge, params +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + mov wd, wm +%endif +%if ARCH_X86_64 + mov paramsq, r6mp + lea r13, [sgr_x_by_x-0xf03] + movifnidn hd, hm + mov edged, r7m + movu m9, [paramsq] + add lpfq, wq + mova m8, [pb_1] + lea t1, [rsp+wq*2+20] + mova m10, [pd_0xf00800a4] + add dstq, wq + lea t3, [rsp+wq*4+400*12+16] + mova m12, [pd_34816] ; (1 << 11) + (1 << 15) + lea t4, [rsp+wq*2+400*20+16] + pshufhw m7, m9, q0000 + pshufb m9, [pw_256] ; s0 + punpckhqdq m7, m7 ; w0 + neg wq + mova m13, [pb_0to15] + pxor m6, m6 + mova m11, [sgr_lshuf5] + psllw m7, 4 + DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w + %define lpfm [rsp] 
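+; A rough map of the constants just loaded: the 5x5 box sums below form
+; a*25 as sumsq + (sumsq << 4) + (sumsq << 3), then p = a*25 - b*b,
+; z = (p*s) >> 20 clamped to 255 by the saturating 0xf008 high word of
+; pd_0xf00800a4 (whose low word, 164 ~= 4096/25, doubles as the pmaddwd
+; multiplier for b), x = sgr_x_by_x[z] (the 0xf00 index bias is folded
+; into the table base, r13 = sgr_x_by_x-0xf03), and finally
+; b' = (x*b*164 + (1 << 11) + (1 << 15)) >> 12.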
+%else + mov r1, [rstk+stack_offset+28] ; params + LEA r6, $$ + movu m1, [r1] + add lpfm, wq + lea t1, [rsp+extra_stack+wq*2+20] + add dstq, wq + lea t3, [rsp+extra_stack+wq*4+400*12+16] + mov dstm, dstq + lea t4, [rsp+extra_stack+wq*2+400*20+16] + mov t3m, t3 + pshufhw m7, m1, q0000 + mov t4m, t4 + pshufb m1, [base+pw_256] ; s0 + punpckhqdq m7, m7 ; w0 + psllw m7, 4 + neg wq + mova m9, m1 + pxor m6, m6 + mov w1m, wd + sub wd, 2 + mov lpfq, lpfm + mov w0m, wd + %define strideq r5 +%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, stridemp + movif32 t2m, t1 + mov t2, t1 + call .top_fixup + add t1, 400*6 + call .h_top + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, strideq + mov lpfm, r10 ; below + movif32 t0m, t2 + mov t0, t2 + dec hd + jz .height1 + or edged, 16 + call .h +.main: + add lpfq, stridemp + movif32 t4, t4m + call .hv + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + movif32 lpfq, hvsrcm + add lpfq, stridemp +%if ARCH_X86_64 + test hb, hb +%else + mov r4, hd + test r4, r4 +%endif + jz .odd_height + call .h + add lpfq, stridemp + call .hv + movif32 dstq, dstm + call .n0 + call .n1 + sub hd, 2 + movif32 t0, t0m + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, lpfm + call .h_top + add lpfq, stridemp + call .hv_bottom +.end: + movif32 dstq, dstm + call .n0 + call .n1 +.end2: + RET +.height1: + movif32 t4, t4m + call .hv + call .prep_n + jmp .odd_height_end +.odd_height: + call .hv + movif32 dstq, dstm + call .n0 + call .n1 +.odd_height_end: + call .v + movif32 dstq, dstm + call .n0 + jmp .end2 +.extend_bottom: + call .v + jmp .end +.no_top: + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov lpfm, r10 + call .h + lea t2, [t1+400*6] + movif32 t2m, t2 + call .top_fixup + dec hd + jz .no_top_height1 + or edged, 16 + mov t0, t1 + mov t1, t2 + movif32 t0m, t0 + jmp .main +.no_top_height1: + movif32 t3, t3m + movif32 t4, t4m + call .v + call .prep_n + jmp .odd_height_end +.extend_right: +%assign stack_offset stack_offset+8 +%assign calloff 8 + movd m1, wd + movd m3, [lpfq-1] + pshufb m1, m6 + pshufb m3, m6 + psubb m2, m8, m1 + pcmpgtb m2, m13 + pand m5, m2 + pandn m2, m3 + por m5, m2 + ret +%assign stack_offset stack_offset-4 +%assign calloff 4 +.h: ; horizontal boxsum +%if ARCH_X86_64 + lea wq, [r4-2] +%else + %define leftq r4 +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 leftq, leftm + movddup m4, [leftq-4] + movif32 wq, w0m + mova m5, [lpfq+wq+2] + add leftmp, 4 + palignr m5, m4, 13 + jmp .h_main +.h_extend_left: + movif32 wq, w0m + mova m5, [lpfq+wq+2] + pshufb m5, m11 + jmp .h_main +.h_top: +%if ARCH_X86_64 + lea wq, [r4-2] +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 wq, w0m +.h_loop: + movu m5, [lpfq+wq-1] +.h_main: + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp wd, -10 + jl .h_have_right + call .extend_right +.h_have_right: + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + palignr m2, m5, m4, 2 + paddw m0, m4, m2 + palignr m3, m5, m4, 6 + paddw m0, m3 + punpcklwd m1, m2, m3 + pmaddwd m1, m1 + punpckhwd m2, m3 + pmaddwd m2, m2 + palignr m5, m4, 8 + paddw m0, m5 + punpcklwd m3, m4, m5 + pmaddwd m3, m3 + paddd m1, m3 + punpckhwd m3, m4, m5 + pmaddwd m3, m3 + shufps m4, m5, q2121 + paddw m0, m4 ; sum + punpcklwd m5, m4, m6 + pmaddwd m5, m5 + punpckhwd m4, m6 + pmaddwd m4, m4 + paddd m2, m3 + test edgeb, 16 ; y > 0 + jz .h_loop_end + paddw m0, [t1+wq*2+400*0] + paddd m1, 
[t1+wq*2+400*2]
+ paddd m2, [t1+wq*2+400*4]
+.h_loop_end:
+ paddd m1, m5 ; sumsq
+ paddd m2, m4
+ mova [t1+wq*2+400*0], m0
+ mova [t1+wq*2+400*2], m1
+ mova [t1+wq*2+400*4], m2
+ add wq, 8
+ jl .h_loop
+ ret
+.top_fixup:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wd, w0m
+%endif
+.top_fixup_loop: ; the sums of the first row need to be doubled
+ mova m0, [t1+wq*2+400*0]
+ mova m1, [t1+wq*2+400*2]
+ mova m2, [t1+wq*2+400*4]
+ paddw m0, m0
+ paddd m1, m1
+ paddd m2, m2
+ mova [t2+wq*2+400*0], m0
+ mova [t2+wq*2+400*2], m1
+ mova [t2+wq*2+400*4], m2
+ add wq, 8
+ jl .top_fixup_loop
+ ret
+ALIGN function_align
+.hv: ; horizontal boxsum + vertical boxsum + ab
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movif32 leftq, leftm
+ movddup m4, [leftq-4]
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ add leftmp, 4
+ palignr m5, m4, 13
+ jmp .hv_main
+.hv_extend_left:
+ movif32 wq, w0m
+ mova m5, [lpfq+wq+2]
+ pshufb m5, m11
+ jmp .hv_main
+.hv_bottom:
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov hvsrcm, lpfq
+%endif
+ test edgeb, 1 ; LR_HAVE_LEFT
+ jz .hv_extend_left
+ movif32 wq, w0m
+%if ARCH_X86_32
+ jmp .hv_loop_start
+%endif
+.hv_loop:
+ movif32 lpfq, hvsrcm
+.hv_loop_start:
+ movu m5, [lpfq+wq-1]
+.hv_main:
+ test edgeb, 2 ; LR_HAVE_RIGHT
+ jnz .hv_have_right
+ cmp wd, -10
+ jl .hv_have_right
+ call .extend_right
+.hv_have_right:
+ movif32 t3, hd
+ punpcklbw m4, m5, m6
+ punpckhbw m5, m6
+ palignr m3, m5, m4, 2
+ paddw m0, m4, m3
+ palignr m1, m5, m4, 6
+ paddw m0, m1
+ punpcklwd m2, m3, m1
+ pmaddwd m2, m2
+ punpckhwd m3, m1
+ pmaddwd m3, m3
+ palignr m5, m4, 8
+ paddw m0, m5
+ punpcklwd m1, m4, m5
+ pmaddwd m1, m1
+ paddd m2, m1
+ punpckhwd m1, m4, m5
+ pmaddwd m1, m1
+ shufps m4, m5, q2121
+ paddw m0, m4 ; h sum
+ punpcklwd m5, m4, m6
+ pmaddwd m5, m5
+ punpckhwd m4, m6
+ pmaddwd m4, m4
+ paddd m3, m1
+ paddd m2, m5 ; h sumsq
+ paddd m3, m4
+ paddw m1, m0, [t1+wq*2+400*0]
+ paddd m4, m2, [t1+wq*2+400*2]
+ paddd m5, m3, [t1+wq*2+400*4]
+%if ARCH_X86_64
+ test hd, hd
+%else
+ test t3, t3
+%endif
+ jz .hv_last_row
+.hv_main2:
+ paddw m1, [t2+wq*2+400*0] ; hv sum
+ paddd m4, [t2+wq*2+400*2] ; hv sumsq
+ paddd m5, [t2+wq*2+400*4]
+ mova [t0+wq*2+400*0], m0
+ pslld m0, m4, 4
+ mova [t0+wq*2+400*2], m2
+ mova [t0+wq*2+400*4], m3
+ pslld m2, m4, 3
+ paddd m4, m0
+ pslld m0, m5, 4
+ paddd m4, m2 ; a * 25
+ pslld m2, m5, 3
+ paddd m5, m0
+ paddd m5, m2
+ punpcklwd m0, m1, m6 ; b
+ punpckhwd m1, m6
+ pmaddwd m2, m0, m0 ; b * b
+ pmaddwd m3, m1, m1
+ psubd m4, m2 ; p
+ psubd m5, m3
+ MULLD m4, m9, m2 ; p * s
+ MULLD m5, m9, m2
+ pmaddwd m0, m10 ; b * 164
+ pmaddwd m1, m10
+ paddusw m4, m10
+ paddusw m5, m10
+ psrld m4, 20 ; min(z, 255)
+ movif32 t3, t3m
+ psrld m5, 20
+ GATHER_X_BY_X m3, m4, m5, t2, t2m
+ punpcklwd m4, m3, m3
+ punpckhwd m5, m3, m3
+ MULLD m0, m4, m2
+ MULLD m1, m5, m2
+ paddd m0, m12 ; x * b * 164 + (1 << 11) + (1 << 15)
+ paddd m1, m12
+ mova [t4+wq*2+4], m3
+ psrld m0, 12 ; b
+ psrld m1, 12
+ mova [t3+wq*4+ 8], m0
+ mova [t3+wq*4+24], m1
+ add wq, 8
+ jl .hv_loop
+ mov t2, t1
+ mov t1, t0
+ mov t0, t2
+ movif32 t2m, t2
+ movif32 t0m, t0
+ ret
+.hv_last_row: ; esoteric edge case for odd heights
+ mova [t1+wq*2+400*0], m1
+ paddw m1, m0
+ mova [t1+wq*2+400*2], m4
+ paddd m4, m2
+ mova [t1+wq*2+400*4], m5
+ paddd m5, m3
+ jmp .hv_main2
+.v: ; vertical boxsum + ab
+%if ARCH_X86_64
+ lea wq, [r4-2]
+%else
+ mov wd, w0m
+%endif
+.v_loop:
+ mova m0, [t1+wq*2+400*0]
+ mova m2, [t1+wq*2+400*2]
+ mova m3, 
[t1+wq*2+400*4] + paddw m1, m0, [t2+wq*2+400*0] + paddd m4, m2, [t2+wq*2+400*2] + paddd m5, m3, [t2+wq*2+400*4] + paddw m0, m0 + paddd m2, m2 + paddd m3, m3 + paddw m1, m0 ; hv sum + paddd m4, m2 ; hv sumsq + pslld m0, m4, 4 + paddd m5, m3 + pslld m2, m4, 3 + paddd m4, m0 + pslld m0, m5, 4 + paddd m4, m2 ; a * 25 + pslld m2, m5, 3 + paddd m5, m0 + paddd m5, m2 + punpcklwd m0, m1, m6 + punpckhwd m1, m6 + pmaddwd m2, m0, m0 ; b * b + pmaddwd m3, m1, m1 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m9, m2 ; p * s + MULLD m5, m9, m2 + pmaddwd m0, m10 ; b * 164 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrld m4, 20 ; min(z, 255) + psrld m5, 20 + GATHER_X_BY_X m3, m4, m5, t2, t2m + punpcklwd m4, m3, m3 + punpckhwd m5, m3, m3 + MULLD m0, m4, m2 + MULLD m1, m5, m2 + paddd m0, m12 ; x * b * 164 + (1 << 11) + (1 << 15) + paddd m1, m12 + mova [t4+wq*2+4], m3 + psrld m0, 12 ; b + psrld m1, 12 + mova [t3+wq*4+ 8], m0 + mova [t3+wq*4+24], m1 + add wq, 8 + jl .v_loop + ret +.prep_n: ; initial neighbor setup + movif64 wq, r4 + movif32 wd, w1m +.prep_n_loop: + movu m0, [t4+wq*2+ 2] + movu m3, [t4+wq*2+ 4] + movu m1, [t3+wq*4+ 4] + movu m4, [t3+wq*4+ 8] + movu m2, [t3+wq*4+20] + movu m5, [t3+wq*4+24] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + paddw m3, [t4+wq*2+ 0] + paddd m4, [t3+wq*4+ 0] + paddd m5, [t3+wq*4+16] + paddw m0, m3 + psllw m3, 2 + paddd m1, m4 + pslld m4, 2 + paddd m2, m5 + pslld m5, 2 + paddw m0, m3 ; a 565 + paddd m1, m4 ; b 565 + paddd m2, m5 + mova [t4+wq*2+400*2+ 0], m0 + mova [t3+wq*4+400*4+ 0], m1 + mova [t3+wq*4+400*4+16], m2 + add wq, 8 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + movif64 wq, r4 + movif32 wd, w1m +.n0_loop: + movu m0, [t4+wq*2+ 2] + movu m3, [t4+wq*2+ 4] + movu m1, [t3+wq*4+ 4] + movu m4, [t3+wq*4+ 8] + movu m2, [t3+wq*4+20] + movu m5, [t3+wq*4+24] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + paddw m3, [t4+wq*2+ 0] + paddd m4, [t3+wq*4+ 0] + paddd m5, [t3+wq*4+16] + paddw m0, m3 + psllw m3, 2 + paddd m1, m4 + pslld m4, 2 + paddd m2, m5 + pslld m5, 2 + paddw m0, m3 ; a 565 + paddd m1, m4 ; b 565 + paddd m2, m5 + paddw m3, m0, [t4+wq*2+400*2+ 0] + paddd m4, m1, [t3+wq*4+400*4+ 0] + paddd m5, m2, [t3+wq*4+400*4+16] + mova [t4+wq*2+400*2+ 0], m0 + mova [t3+wq*4+400*4+ 0], m1 + mova [t3+wq*4+400*4+16], m2 + movq m0, [dstq+wq] + punpcklbw m0, m6 + punpcklwd m1, m0, m6 ; src + punpcklwd m2, m3, m6 ; a + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + psubd m4, m2 ; b - a * src + (1 << 8) + psubd m5, m3 + psrad m4, 9 + psrad m5, 9 + packssdw m4, m5 + pmulhrsw m4, m7 + paddw m0, m4 + packuswb m0, m0 + movq [dstq+wq], m0 + add wq, 8 + jl .n0_loop + add dstq, stridemp + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + movif64 wq, r4 + movif32 wd, w1m +.n1_loop: + movq m0, [dstq+wq] + mova m3, [t4+wq*2+400*2+ 0] + mova m4, [t3+wq*4+400*4+ 0] + mova m5, [t3+wq*4+400*4+16] + punpcklbw m0, m6 + punpcklwd m1, m0, m6 ; src + punpcklwd m2, m3, m6 ; a + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + psubd m4, m2 ; b - a * src + (1 << 7) + psubd m5, m3 + psrad m4, 8 + psrad m5, 8 + packssdw m4, m5 + pmulhrsw m4, m7 + paddw m0, m4 + packuswb m0, m0 + movq [dstq+wq], m0 + add wq, 8 + jl .n1_loop + add dstq, stridemp + movif32 dstm, dstq + ret + +%if ARCH_X86_32 + %if STACK_ALIGNMENT < 16 + %assign extra_stack 4*16 + %else + %assign extra_stack 2*16 + %endif +cglobal sgr_filter_3x3_8bpc, 1, 7, 8, -400*42-16-extra_stack, \ + dst, stride, 
left, lpf, w + %if STACK_ALIGNMENT < 16 + %define dstm dword [esp+calloff+16*2+4*0] + %define stridemp dword [esp+calloff+16*2+4*1] + %define leftm dword [esp+calloff+16*2+4*2] + %define lpfm dword [esp+calloff+16*2+4*3] + %define w0m dword [esp+calloff+16*2+4*4] + %define hd dword [esp+calloff+16*2+4*5] + %define edgeb byte [esp+calloff+16*2+4*6] + %define edged dword [esp+calloff+16*2+4*6] + %define leftmp leftm + %else + %define w0m wm + %define hd dword r5m + %define edgeb byte r7m + %define edged dword r7m + %endif + %define hvsrcm dword [esp+calloff+4*0] + %define w1m dword [esp+calloff+4*1] + %define t3m dword [esp+calloff+4*2] + %define t4m dword [esp+calloff+4*3] + %define m8 [base+pb_0to15] + %define m9 [esp+calloff+16*1] + %define m10 [base+pd_0xf00801c7] + %define m11 [base+pd_34816] + %define m12 m6 + %define m13 [base+sgr_lshuf3] + %define base r6-$$ + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov strideq, [rstk+stack_offset+ 8] + mov leftq, [rstk+stack_offset+12] + mov lpfq, [rstk+stack_offset+16] + mov wd, [rstk+stack_offset+20] + mov dstm, dstq + mov stridemp, strideq + mov leftm, leftq + mov r1, [rstk+stack_offset+24] + mov r2, [rstk+stack_offset+32] + mov lpfm, lpfq + mov hd, r1 + mov edged, r2 + %endif +%else +cglobal sgr_filter_3x3_8bpc, 4, 15, 14, -400*42-8, dst, stride, left, lpf, \ + w, h, edge, params +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + mov wd, wm +%endif +%if ARCH_X86_64 + mov paramsq, r6mp + lea r13, [sgr_x_by_x-0xf03] + mov hd, hm + mov edged, r7m + movq m9, [paramsq+4] + add lpfq, wq + lea t1, [rsp+wq*2+12] + mova m8, [pb_0to15] + add dstq, wq + lea t3, [rsp+wq*4+400*12+8] + mova m10, [pd_0xf00801c7] + lea t4, [rsp+wq*2+400*32+8] + mova m11, [pd_34816] + pshuflw m7, m9, q3333 + pshufb m9, [pw_256] ; s1 + punpcklqdq m7, m7 ; w1 + neg wq + pxor m6, m6 + mova m13, [sgr_lshuf3] + psllw m7, 4 + DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w + %define lpfm [rsp] +%else + mov r1, [rstk+stack_offset+28] ; params + LEA r6, $$ + movq m1, [r1+4] + add lpfm, wq + lea t1, [rsp+extra_stack+wq*2+20] + add dstq, wq + lea t3, [rsp+extra_stack+wq*4+400*12+16] + mov dstm, dstq + lea t4, [rsp+extra_stack+wq*2+400*32+16] + mov t3m, t3 + pshuflw m7, m1, q3333 + mov t4m, t4 + pshufb m1, [base+pw_256] ; s1 + punpcklqdq m7, m7 ; w1 + psllw m7, 4 + neg wq + mova m9, m1 + pxor m6, m6 + mov w1m, wd + sub wd, 2 + mov lpfq, lpfm + mov w0m, wd + %define strideq r5 +%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, stridemp + mov t2, t1 + add t1, 400*6 + call .h_top + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, strideq + mov lpfm, r10 ; below + movif32 t4, t4m + call .hv0 +.main: + dec hd + jz .height1 + movif32 lpfq, hvsrcm + add lpfq, stridemp + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + movif32 lpfq, hvsrcm + add lpfq, stridemp + call .hv0 +%if ARCH_X86_64 + test hb, hb +%else + mov r4, hd + test r4, r4 +%endif + jz .odd_height + movif32 lpfq, hvsrcm + add lpfq, stridemp + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, lpfm + call .hv0_bottom + movif32 lpfq, hvsrcm + add lpfq, stridemp + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + movif32 
strideq, stridemp + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov lpfm, r10 + call .h +%if ARCH_X86_64 + lea wq, [r4-2] +%else + mov wq, w0m + mov hvsrcm, lpfq +%endif + lea t2, [t1+400*6] +.top_fixup_loop: + mova m0, [t1+wq*2+400*0] + mova m1, [t1+wq*2+400*2] + mova m2, [t1+wq*2+400*4] + mova [t2+wq*2+400*0], m0 + mova [t2+wq*2+400*2], m1 + mova [t2+wq*2+400*4], m2 + add wq, 8 + jl .top_fixup_loop + movif32 t3, t3m + movif32 t4, t4m + call .v0 + jmp .main +.extend_right: +%assign stack_offset stack_offset+8 +%assign calloff 8 + movd m0, [lpfq-1] + movd m1, wd + mova m3, m8 + pshufb m0, m6 + pshufb m1, m6 + mova m2, m6 + psubb m2, m1 + pcmpgtb m2, m3 + pand m5, m2 + pandn m2, m0 + por m5, m2 + ret +%assign stack_offset stack_offset-4 +%assign calloff 4 +.h: ; horizontal boxsum +%if ARCH_X86_64 + lea wq, [r4-2] +%else + %define leftq r4 +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 leftq, leftm + movddup m4, [leftq-4] + movif32 wq, w0m + mova m5, [lpfq+wq+2] + add leftmp, 4 + palignr m5, m4, 14 + jmp .h_main +.h_extend_left: + movif32 wq, w0m + mova m5, [lpfq+wq+2] + pshufb m5, m13 + jmp .h_main +.h_top: +%if ARCH_X86_64 + lea wq, [r4-2] +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 wq, w0m +.h_loop: + movu m5, [lpfq+wq] +.h_main: + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .h_have_right + cmp wd, -9 + jl .h_have_right + call .extend_right +.h_have_right: + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + palignr m0, m5, m4, 2 + paddw m1, m4, m0 + punpcklwd m2, m4, m0 + pmaddwd m2, m2 + punpckhwd m3, m4, m0 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m1, m5 ; sum + punpcklwd m4, m5, m6 + pmaddwd m4, m4 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m4 ; sumsq + paddd m3, m5 + mova [t1+wq*2+400*0], m1 + mova [t1+wq*2+400*2], m2 + mova [t1+wq*2+400*4], m3 + add wq, 8 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsum + vertical boxsum + ab (even rows) +%if ARCH_X86_64 + lea wq, [r4-2] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movif32 leftq, leftm + movddup m4, [leftq-4] + movif32 wq, w0m + mova m5, [lpfq+wq+2] + add leftmp, 4 + palignr m5, m4, 14 + jmp .hv0_main +.hv0_extend_left: + movif32 wq, w0m + mova m5, [lpfq+wq+2] + pshufb m5, m13 + jmp .hv0_main +.hv0_bottom: +%if ARCH_X86_64 + lea wq, [r4-2] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv0_loop_start +%endif +.hv0_loop: + movif32 lpfq, hvsrcm +.hv0_loop_start: + movu m5, [lpfq+wq] +.hv0_main: + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv0_have_right + cmp wd, -9 + jl .hv0_have_right + call .extend_right +.hv0_have_right: + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + palignr m0, m5, m4, 2 + paddw m1, m4, m0 + punpcklwd m2, m4, m0 + pmaddwd m2, m2 + punpckhwd m3, m4, m0 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m1, m5 ; sum + punpcklwd m4, m5, m6 + pmaddwd m4, m4 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m4 ; sumsq + paddd m3, m5 + paddw m0, m1, [t1+wq*2+400*0] + paddd m4, m2, [t1+wq*2+400*2] + paddd m5, m3, [t1+wq*2+400*4] + mova [t1+wq*2+400*0], m1 + mova [t1+wq*2+400*2], m2 + mova [t1+wq*2+400*4], m3 + paddw m1, m0, [t2+wq*2+400*0] + paddd m2, m4, [t2+wq*2+400*2] + paddd m3, m5, [t2+wq*2+400*4] + mova [t2+wq*2+400*0], m0 + mova [t2+wq*2+400*2], m4 + mova [t2+wq*2+400*4], m5 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; a * 9 + paddd m5, m3 + punpcklwd m0, m1, m6 ; b + pmaddwd m2, m0, m0 ; b * b + punpckhwd m1, m6 
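+ ; same scheme as the 5x5 filter, scaled for a 3x3 box: a*9 is
+ ; (sumsq << 3) + sumsq, p = a*9 - b*b, and pd_0xf00801c7 packs
+ ; 455 ~= 4096/9 with the same 0xf008 saturation word that clamps
+ ; z to 255.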
+ pmaddwd m3, m1, m1 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m9, m12 ; p * s + MULLD m5, m9, m12 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrld m4, 20 ; min(z, 255) + movif32 t3, t3m + psrld m5, 20 + GATHER_X_BY_X m3, m4, m5, r0, dstm + punpcklwd m4, m3, m3 + punpckhwd m5, m3, m3 + MULLD m0, m4, m12 + MULLD m1, m5, m12 +%if ARCH_X86_32 + pxor m6, m6 +%endif + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + mova [t4+wq*2+4], m3 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*4+ 8], m0 + mova [t3+wq*4+24], m1 + add wq, 8 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) +%if ARCH_X86_64 + lea wq, [r4-2] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movif32 leftq, leftm + movddup m4, [leftq-4] + movif32 wq, w0m + mova m5, [lpfq+wq+2] + add leftmp, 4 + palignr m5, m4, 14 + jmp .hv1_main +.hv1_extend_left: + movif32 wq, w0m + mova m5, [lpfq+wq+2] + pshufb m5, m13 + jmp .hv1_main +.hv1_bottom: +%if ARCH_X86_64 + lea wq, [r4-2] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv1_loop_start +%endif +.hv1_loop: + movif32 lpfq, hvsrcm +.hv1_loop_start: + movu m5, [lpfq+wq] +.hv1_main: + test edgeb, 2 ; LR_HAVE_RIGHT + jnz .hv1_have_right + cmp wd, -9 + jl .hv1_have_right + call .extend_right +.hv1_have_right: + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + palignr m1, m5, m4, 2 + paddw m0, m4, m1 + punpcklwd m2, m4, m1 + pmaddwd m2, m2 + punpckhwd m3, m4, m1 + pmaddwd m3, m3 + palignr m5, m4, 4 + paddw m0, m5 ; h sum + punpcklwd m1, m5, m6 + pmaddwd m1, m1 + punpckhwd m5, m6 + pmaddwd m5, m5 + paddd m2, m1 ; h sumsq + paddd m3, m5 + paddw m1, m0, [t2+wq*2+400*0] + paddd m4, m2, [t2+wq*2+400*2] + paddd m5, m3, [t2+wq*2+400*4] + mova [t2+wq*2+400*0], m0 + mova [t2+wq*2+400*2], m2 + mova [t2+wq*2+400*4], m3 + pslld m2, m4, 3 + pslld m3, m5, 3 + paddd m4, m2 ; a * 9 + paddd m5, m3 + punpcklwd m0, m1, m6 ; b + pmaddwd m2, m0, m0 ; b * b + punpckhwd m1, m6 + pmaddwd m3, m1, m1 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m9, m12 ; p * s + MULLD m5, m9, m12 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrld m4, 20 ; min(z, 255) + movif32 t3, t3m + psrld m5, 20 + GATHER_X_BY_X m3, m4, m5, r0, dstm + punpcklwd m4, m3, m3 + punpckhwd m5, m3, m3 + MULLD m0, m4, m12 + MULLD m1, m5, m12 +%if ARCH_X86_32 + pxor m6, m6 +%endif + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + mova [t4+wq*2+400*2 +4], m3 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*4+400*4+ 8], m0 + mova [t3+wq*4+400*4+24], m1 + add wq, 8 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab (even rows) +%if ARCH_X86_64 + lea wq, [r4-2] +%else + mov wd, w0m +%endif +.v0_loop: + mova m0, [t1+wq*2+400*0] + mova m4, [t1+wq*2+400*2] + mova m5, [t1+wq*2+400*4] + paddw m0, m0 + paddd m4, m4 + paddd m5, m5 + paddw m1, m0, [t2+wq*2+400*0] + paddd m2, m4, [t2+wq*2+400*2] + paddd m3, m5, [t2+wq*2+400*4] + mova [t2+wq*2+400*0], m0 + mova [t2+wq*2+400*2], m4 + mova [t2+wq*2+400*4], m5 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; a * 9 + paddd m5, m3 + punpcklwd m0, m1, m6 ; b + pmaddwd m2, m0, m0 ; b * b + punpckhwd m1, m6 + pmaddwd m3, m1, m1 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m9, m12 ; p * s + MULLD m5, m9, m12 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrld m4, 
20 ; min(z, 255) + psrld m5, 20 + GATHER_X_BY_X m3, m4, m5, r0, dstm + punpcklwd m4, m3, m3 + punpckhwd m5, m3, m3 + MULLD m0, m4, m12 + MULLD m1, m5, m12 +%if ARCH_X86_32 + pxor m6, m6 +%endif + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + mova [t4+wq*2+4], m3 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*4+ 8], m0 + mova [t3+wq*4+24], m1 + add wq, 8 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) +%if ARCH_X86_64 + lea wq, [r4-2] +%else + mov wd, w0m +%endif +.v1_loop: + mova m0, [t1+wq*2+400*0] + mova m4, [t1+wq*2+400*2] + mova m5, [t1+wq*2+400*4] + paddw m1, m0, [t2+wq*2+400*0] + paddd m2, m4, [t2+wq*2+400*2] + paddd m3, m5, [t2+wq*2+400*4] + mova [t2+wq*2+400*0], m0 + mova [t2+wq*2+400*2], m4 + mova [t2+wq*2+400*4], m5 + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; a * 9 + paddd m5, m3 + punpcklwd m0, m1, m6 ; b + pmaddwd m2, m0, m0 ; b * b + punpckhwd m1, m6 + pmaddwd m3, m1, m1 + psubd m4, m2 ; p + psubd m5, m3 + MULLD m4, m9, m12 ; p * s + MULLD m5, m9, m12 + pmaddwd m0, m10 ; b * 455 + pmaddwd m1, m10 + paddusw m4, m10 + paddusw m5, m10 + psrld m4, 20 ; min(z, 255) + psrld m5, 20 + GATHER_X_BY_X m3, m4, m5, r0, dstm + punpcklwd m4, m3, m3 + punpckhwd m5, m3, m3 + MULLD m0, m4, m12 + MULLD m1, m5, m12 +%if ARCH_X86_32 + pxor m6, m6 +%endif + paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) + paddd m1, m11 + mova [t4+wq*2+400*2+ 4], m3 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*4+400*4+ 8], m0 + mova [t3+wq*4+400*4+24], m1 + add wq, 8 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + movif64 wq, r4 + movif32 wd, w1m +.prep_n_loop: + movu m0, [t4+wq*2+400*0+ 4] + movu m1, [t3+wq*4+400*0+ 8] + movu m2, [t3+wq*4+400*0+24] + movu m3, [t4+wq*2+400*0+ 2] + movu m4, [t3+wq*4+400*0+ 4] + movu m5, [t3+wq*4+400*0+20] + paddw m0, [t4+wq*2+400*0+ 0] + paddd m1, [t3+wq*4+400*0+ 0] + paddd m2, [t3+wq*4+400*0+16] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + psllw m3, 2 ; a[-1] 444 + pslld m4, 2 ; b[-1] 444 + pslld m5, 2 + psubw m3, m0 ; a[-1] 343 + psubd m4, m1 ; b[-1] 343 + psubd m5, m2 + mova [t4+wq*2+400*4], m3 + mova [t3+wq*4+400*8+ 0], m4 + mova [t3+wq*4+400*8+16], m5 + movu m0, [t4+wq*2+400*2+ 4] + movu m1, [t3+wq*4+400*4+ 8] + movu m2, [t3+wq*4+400*4+24] + movu m3, [t4+wq*2+400*2+ 2] + movu m4, [t3+wq*4+400*4+ 4] + movu m5, [t3+wq*4+400*4+20] + paddw m0, [t4+wq*2+400*2+ 0] + paddd m1, [t3+wq*4+400*4+ 0] + paddd m2, [t3+wq*4+400*4+16] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + psllw m3, 2 ; a[ 0] 444 + pslld m4, 2 ; b[ 0] 444 + pslld m5, 2 + mova [t4+wq*2+400* 6], m3 + mova [t3+wq*4+400*12+ 0], m4 + mova [t3+wq*4+400*12+16], m5 + psubw m3, m0 ; a[ 0] 343 + psubd m4, m1 ; b[ 0] 343 + psubd m5, m2 + mova [t4+wq*2+400* 8], m3 + mova [t3+wq*4+400*16+ 0], m4 + mova [t3+wq*4+400*16+16], m5 + add wq, 8 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + movif64 wq, r4 + movif32 wd, w1m +.n0_loop: + movu m3, [t4+wq*2+400*0+4] + movu m1, [t4+wq*2+400*0+2] + paddw m3, [t4+wq*2+400*0+0] + paddw m1, m3 + psllw m1, 2 ; a[ 1] 444 + psubw m2, m1, m3 ; a[ 1] 343 + paddw m3, m2, [t4+wq*2+400*4] + paddw m3, [t4+wq*2+400*6] + mova [t4+wq*2+400*4], m2 + mova [t4+wq*2+400*6], m1 + movu m4, [t3+wq*4+400*0+8] + movu m1, [t3+wq*4+400*0+4] + paddd m4, [t3+wq*4+400*0+0] + paddd m1, m4 + pslld m1, 2 ; b[ 1] 444 + psubd m2, m1, m4 ; b[ 1] 343 + paddd m4, m2, [t3+wq*4+400* 8+ 0] + paddd m4, [t3+wq*4+400*12+ 0] + mova [t3+wq*4+400* 8+ 0], m2 + mova [t3+wq*4+400*12+ 0], m1 + movu m5, 
[t3+wq*4+400*0+24] + movu m1, [t3+wq*4+400*0+20] + paddd m5, [t3+wq*4+400*0+16] + paddd m1, m5 + pslld m1, 2 + psubd m2, m1, m5 + paddd m5, m2, [t3+wq*4+400* 8+16] + paddd m5, [t3+wq*4+400*12+16] + mova [t3+wq*4+400* 8+16], m2 + mova [t3+wq*4+400*12+16], m1 + movq m0, [dstq+wq] + punpcklbw m0, m6 + punpcklwd m1, m0, m6 + punpcklwd m2, m3, m6 + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + psubd m4, m2 ; b - a * src + (1 << 8) + psubd m5, m3 + psrad m4, 9 + psrad m5, 9 + packssdw m4, m5 + pmulhrsw m4, m7 + paddw m0, m4 + packuswb m0, m0 + movq [dstq+wq], m0 + add wq, 8 + jl .n0_loop + add dstq, stridemp + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + movif64 wq, r4 + movif32 wd, w1m +.n1_loop: + movu m3, [t4+wq*2+400*2+4] + movu m1, [t4+wq*2+400*2+2] + paddw m3, [t4+wq*2+400*2+0] + paddw m1, m3 + psllw m1, 2 ; a[ 1] 444 + psubw m2, m1, m3 ; a[ 1] 343 + paddw m3, m2, [t4+wq*2+400*6] + paddw m3, [t4+wq*2+400*8] + mova [t4+wq*2+400*6], m1 + mova [t4+wq*2+400*8], m2 + movu m4, [t3+wq*4+400*4+8] + movu m1, [t3+wq*4+400*4+4] + paddd m4, [t3+wq*4+400*4+0] + paddd m1, m4 + pslld m1, 2 ; b[ 1] 444 + psubd m2, m1, m4 ; b[ 1] 343 + paddd m4, m2, [t3+wq*4+400*12+ 0] + paddd m4, [t3+wq*4+400*16+ 0] + mova [t3+wq*4+400*12+ 0], m1 + mova [t3+wq*4+400*16+ 0], m2 + movu m5, [t3+wq*4+400*4+24] + movu m1, [t3+wq*4+400*4+20] + paddd m5, [t3+wq*4+400*4+16] + paddd m1, m5 + pslld m1, 2 + psubd m2, m1, m5 + paddd m5, m2, [t3+wq*4+400*12+16] + paddd m5, [t3+wq*4+400*16+16] + mova [t3+wq*4+400*12+16], m1 + mova [t3+wq*4+400*16+16], m2 + movq m0, [dstq+wq] + punpcklbw m0, m6 + punpcklwd m1, m0, m6 + punpcklwd m2, m3, m6 + pmaddwd m2, m1 ; a * src + punpckhwd m1, m0, m6 + punpckhwd m3, m6 + pmaddwd m3, m1 + psubd m4, m2 ; b - a * src + (1 << 8) + psubd m5, m3 + psrad m4, 9 + psrad m5, 9 + packssdw m4, m5 + pmulhrsw m4, m7 + paddw m0, m4 + packuswb m0, m0 + movq [dstq+wq], m0 + add wq, 8 + jl .n1_loop + add dstq, stridemp + movif32 dstm, dstq + ret + +%if ARCH_X86_32 + %if STACK_ALIGNMENT < 16 + %assign extra_stack 10*16 + %else + %assign extra_stack 8*16 + %endif +cglobal sgr_filter_mix_8bpc, 1, 7, 8, -400*66-48-extra_stack, \ + dst, stride, left, lpf, w + %if STACK_ALIGNMENT < 16 + %define dstm dword [esp+calloff+16*8+4*0] + %define stridemp dword [esp+calloff+16*8+4*1] + %define leftm dword [esp+calloff+16*8+4*2] + %define lpfm dword [esp+calloff+16*8+4*3] + %define w0m dword [esp+calloff+16*8+4*4] + %define hd dword [esp+calloff+16*8+4*5] + %define edgeb byte [esp+calloff+16*8+4*6] + %define edged dword [esp+calloff+16*8+4*6] + %define leftmp leftm + %else + %define w0m wm + %define hd dword r5m + %define edgeb byte r7m + %define edged dword r7m + %endif + %define hvsrcm dword [esp+calloff+4*0] + %define w1m dword [esp+calloff+4*1] + %define t3m dword [esp+calloff+4*2] + %define t4m dword [esp+calloff+4*3] + %xdefine m8 m6 + %define m9 [base+pd_0xffff] + %define m10 [base+pd_34816] + %define m11 [base+pd_0xf00801c7] + %define m12 [base+pd_0xf00800a4] + %define m13 [esp+calloff+16*4] + %define m14 [esp+calloff+16*5] + %define m15 [esp+calloff+16*6] + %define m6 [esp+calloff+16*7] + %define base r6-$$ + %assign calloff 0 + %if STACK_ALIGNMENT < 16 + mov strideq, [rstk+stack_offset+ 8] + mov leftq, [rstk+stack_offset+12] + mov lpfq, [rstk+stack_offset+16] + mov wd, [rstk+stack_offset+20] + mov dstm, dstq + mov stridemp, strideq + mov leftm, leftq + mov r1, [rstk+stack_offset+24] + mov r2, [rstk+stack_offset+32] + mov lpfm, lpfq + mov hd, r1 + mov edged, r2 + 
%endif +%else +cglobal sgr_filter_mix_8bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \ + w, h, edge, params +%endif +%if ARCH_X86_64 || STACK_ALIGNMENT >= 16 + mov wd, wm +%endif +%if ARCH_X86_64 + mov paramsq, r6mp + lea r13, [sgr_x_by_x-0xf03] + movifnidn hd, hm + mov edged, r7m + mova m15, [paramsq] + add lpfq, wq + mova m9, [pd_0xffff] + lea t1, [rsp+wq*2+44] + mova m10, [pd_34816] + add dstq, wq + lea t3, [rsp+wq*4+400*24+40] + mova m11, [pd_0xf00801c7] + lea t4, [rsp+wq*2+400*52+40] + mova m12, [base+pd_0xf00800a4] + neg wq + pshuflw m13, m15, q0000 + pshuflw m14, m15, q2222 + pshufhw m15, m15, q1010 + punpcklqdq m13, m13 ; s0 + punpcklqdq m14, m14 ; s1 + punpckhqdq m15, m15 ; w0 w1 + pxor m6, m6 + psllw m15, 2 + DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w + %define lpfm [rsp] +%else + mov r1, [rstk+stack_offset+28] ; params + LEA r6, $$ + mova m2, [r1] + add lpfm, wq + lea t1, [rsp+extra_stack+wq*2+52] + add dstq, wq + lea t3, [rsp+extra_stack+wq*4+400*24+48] + mov dstm, dstq + lea t4, [rsp+extra_stack+wq*2+400*52+48] + mov t3m, t3 + mov t4m, t4 + neg wq + pshuflw m0, m2, q0000 + pshuflw m1, m2, q2222 + pshufhw m2, m2, q1010 + punpcklqdq m0, m0 ; s0 + punpcklqdq m1, m1 ; s1 + punpckhqdq m2, m2 ; w0 w1 + mov w1m, wd + pxor m3, m3 + psllw m2, 2 + mova m13, m0 + mova m14, m1 + sub wd, 2 + mova m15, m2 + mova m6, m3 + mov lpfq, lpfm + mov w0m, wd + %define strideq r5 +%endif + test edgeb, 4 ; LR_HAVE_TOP + jz .no_top + call .h_top + add lpfq, stridemp + mov t2, t1 +%if ARCH_X86_64 + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup +%else + mov wq, w0m + call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup_loop +%endif + add t1, 400*12 + call .h_top + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + add r10, strideq + mov lpfm, r10 ; below + movif32 t4, t4m + call .hv0 +.main: + dec hd + jz .height1 + movif32 lpfq, hvsrcm + add lpfq, stridemp + call .hv1 + call .prep_n + sub hd, 2 + jl .extend_bottom +.main_loop: + movif32 lpfq, hvsrcm + add lpfq, stridemp + call .hv0 +%if ARCH_X86_64 + test hd, hd +%else + mov r4, hd + test r4, r4 +%endif + jz .odd_height + movif32 lpfq, hvsrcm + add lpfq, stridemp + call .hv1 + call .n0 + call .n1 + sub hd, 2 + jge .main_loop + test edgeb, 8 ; LR_HAVE_BOTTOM + jz .extend_bottom + mov lpfq, lpfm + call .hv0_bottom + movif32 lpfq, hvsrcm + add lpfq, stridemp + call .hv1_bottom +.end: + call .n0 + call .n1 +.end2: + RET +.height1: + call .v1 + call .prep_n + jmp .odd_height_end +.odd_height: + call .v1 + call .n0 + call .n1 +.odd_height_end: + call .v0 + call .v1 + call .n0 + jmp .end2 +.extend_bottom: + call .v0 + call .v1 + jmp .end +.no_top: + movif32 strideq, stridemp + lea r10, [lpfq+strideq*4] + mov lpfq, dstq + lea r10, [r10+strideq*2] + mov lpfm, r10 + call .h +%if ARCH_X86_64 + lea wq, [r4-2] +%else + mov wq, w0m + mov hvsrcm, lpfq +%endif + lea t2, [t1+400*12] +.top_fixup_loop: + mova m0, [t1+wq*2+400* 0] + mova m1, [t1+wq*2+400* 2] + mova m2, [t1+wq*2+400* 4] + paddw m0, m0 + mova m3, [t1+wq*2+400* 6] + paddd m1, m1 + mova m4, [t1+wq*2+400* 8] + paddd m2, m2 + mova m5, [t1+wq*2+400*10] + mova [t2+wq*2+400* 0], m0 + mova [t2+wq*2+400* 2], m1 + mova [t2+wq*2+400* 4], m2 + mova [t2+wq*2+400* 6], m3 + mova [t2+wq*2+400* 8], m4 + mova [t2+wq*2+400*10], m5 + add wq, 8 + jl .top_fixup_loop + movif32 t3, t3m + movif32 t4, t4m + call .v0 + jmp .main +.extend_right: +%assign stack_offset stack_offset+8 +%assign calloff 8 +%if ARCH_X86_64 + SWAP m8, m6 +%endif + movd m1, wd + 
movd m3, [lpfq-1] + pshufb m1, m8 + pshufb m3, m8 + psubb m2, [base+pb_1], m1 + pcmpgtb m2, [base+pb_0to15] + pand m5, m2 + pandn m2, m3 + por m5, m2 +%if ARCH_X86_64 + SWAP m6, m8 +%endif + ret +%assign stack_offset stack_offset-4 +%assign calloff 4 +.h: ; horizontal boxsum +%if ARCH_X86_64 + lea wq, [r4-2] +%else + %define leftq r4 +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 leftq, leftm + movddup m4, [leftq-4] + movif32 wq, w0m + mova m5, [lpfq+wq+2] + add leftmp, 4 + palignr m5, m4, 13 + jmp .h_main +.h_extend_left: + movif32 wq, w0m + mova m5, [lpfq+wq+2] + pshufb m5, [base+sgr_lshuf5] + jmp .h_main +.h_top: +%if ARCH_X86_64 + lea wq, [r4-2] +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .h_extend_left + movif32 wq, w0m +.h_loop: + movu m5, [lpfq+wq-1] +.h_main: + test edgeb, 2 ; LR_HAVE_RIGHT +%if ARCH_X86_32 + pxor m8, m8 +%else + SWAP m8, m6 +%endif + jnz .h_have_right + cmp wd, -10 + jl .h_have_right + call .extend_right +.h_have_right: + punpcklbw m4, m5, m8 + punpckhbw m5, m8 + palignr m3, m5, m4, 2 + palignr m0, m5, m4, 4 + paddw m1, m3, m0 + punpcklwd m2, m3, m0 + pmaddwd m2, m2 + punpckhwd m3, m0 + pmaddwd m3, m3 + palignr m0, m5, m4, 6 + paddw m1, m0 ; sum3 + punpcklwd m7, m0, m8 + pmaddwd m7, m7 + punpckhwd m0, m8 + pmaddwd m0, m0 +%if ARCH_X86_64 + SWAP m6, m8 +%endif + paddd m2, m7 ; sumsq3 + palignr m5, m4, 8 + punpcklwd m7, m5, m4 + paddw m8, m4, m5 + pmaddwd m7, m7 + punpckhwd m5, m4 + pmaddwd m5, m5 + paddd m3, m0 + mova [t1+wq*2+400* 6], m1 + mova [t1+wq*2+400* 8], m2 + mova [t1+wq*2+400*10], m3 + paddw m8, m1 ; sum5 + paddd m7, m2 ; sumsq5 + paddd m5, m3 + mova [t1+wq*2+400* 0], m8 + mova [t1+wq*2+400* 2], m7 + mova [t1+wq*2+400* 4], m5 + add wq, 8 + jl .h_loop + ret +ALIGN function_align +.hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows) +%if ARCH_X86_64 + lea wq, [r4-2] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movif32 leftq, leftm + movddup m4, [leftq-4] + movif32 wq, w0m + mova m5, [lpfq+wq+2] + add leftmp, 4 + palignr m5, m4, 13 + jmp .hv0_main +.hv0_extend_left: + movif32 wq, w0m + mova m5, [lpfq+wq+2] + pshufb m5, [base+sgr_lshuf5] + jmp .hv0_main +.hv0_bottom: +%if ARCH_X86_64 + lea wq, [r4-2] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv0_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv0_loop_start +%endif +.hv0_loop: + movif32 lpfq, hvsrcm +.hv0_loop_start: + movu m5, [lpfq+wq-1] +.hv0_main: + test edgeb, 2 ; LR_HAVE_RIGHT +%if ARCH_X86_32 + pxor m8, m8 +%else + SWAP m8, m6 +%endif + jnz .hv0_have_right + cmp wd, -10 + jl .hv0_have_right + call .extend_right +.hv0_have_right: + punpcklbw m4, m5, m8 + punpckhbw m5, m8 + palignr m3, m5, m4, 2 + palignr m0, m5, m4, 4 + movif32 t3, t3m + paddw m1, m3, m0 + punpcklwd m2, m3, m0 + pmaddwd m2, m2 + punpckhwd m3, m0 + pmaddwd m3, m3 + palignr m0, m5, m4, 6 + paddw m1, m0 ; h sum3 + punpcklwd m7, m0, m8 + pmaddwd m7, m7 + punpckhwd m0, m8 +%if ARCH_X86_64 + SWAP m6, m8 +%endif + pmaddwd m0, m0 + paddd m2, m7 ; h sumsq3 + palignr m5, m4, 8 + punpcklwd m7, m5, m4 + paddw m8, m4, m5 + pmaddwd m7, m7 + punpckhwd m5, m4 + pmaddwd m5, m5 + paddd m3, m0 + paddw m8, m1 ; h sum5 + paddd m7, m2 ; h sumsq5 + paddd m5, m3 + mova [t3+wq*4+400*8+ 8], m8 + mova [t3+wq*4+400*0+ 8], m7 + mova [t3+wq*4+400*0+24], m5 + paddw m8, [t1+wq*2+400* 0] + paddd m7, [t1+wq*2+400* 2] + paddd m5, [t1+wq*2+400* 4] + mova [t1+wq*2+400* 0], m8 + mova [t1+wq*2+400* 2], m7 + mova [t1+wq*2+400* 4], m5 + paddw m0, m1, [t1+wq*2+400* 6] + 
paddd m4, m2, [t1+wq*2+400* 8] + paddd m5, m3, [t1+wq*2+400*10] + mova [t1+wq*2+400* 6], m1 + mova [t1+wq*2+400* 8], m2 + mova [t1+wq*2+400*10], m3 + paddw m1, m0, [t2+wq*2+400* 6] + paddd m2, m4, [t2+wq*2+400* 8] + paddd m3, m5, [t2+wq*2+400*10] + mova [t2+wq*2+400* 6], m0 + mova [t2+wq*2+400* 8], m4 + mova [t2+wq*2+400*10], m5 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; a3 * 9 + paddd m5, m3 + punpcklwd m0, m1, m7 ; b3 + pmaddwd m2, m0, m0 + punpckhwd m1, m7 + pmaddwd m3, m1, m1 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + psubd m4, m2 ; p3 + psubd m5, m3 + MULLD m4, m14, m7 ; p3 * s1 + MULLD m5, m14, m7 + pmaddwd m0, m11 ; b3 * 455 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m4, 20 ; min(z3, 255) + psrld m5, 20 + GATHER_X_BY_X m3, m4, m5, r0, dstm + punpcklwd m4, m3, m3 + punpckhwd m5, m3, m3 + MULLD m0, m4, m7 + MULLD m1, m5, m7 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + mova [t4+wq*2+400*2+ 4], m3 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*4+400*4+ 8], m0 + mova [t3+wq*4+400*4+24], m1 + add wq, 8 + jl .hv0_loop + ret +ALIGN function_align +.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) +%if ARCH_X86_64 + lea wq, [r4-2] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movif32 leftq, leftm + movddup m4, [leftq-4] + movif32 wq, w0m + mova m5, [lpfq+wq+2] + add leftmp, 4 + palignr m5, m4, 13 + jmp .hv1_main +.hv1_extend_left: + movif32 wq, w0m + mova m5, [lpfq+wq+2] + pshufb m5, [base+sgr_lshuf5] + jmp .hv1_main +.hv1_bottom: +%if ARCH_X86_64 + lea wq, [r4-2] +%else + mov hvsrcm, lpfq +%endif + test edgeb, 1 ; LR_HAVE_LEFT + jz .hv1_extend_left + movif32 wq, w0m +%if ARCH_X86_32 + jmp .hv1_loop_start +%endif +.hv1_loop: + movif32 lpfq, hvsrcm +.hv1_loop_start: + movu m5, [lpfq+wq-1] +.hv1_main: + test edgeb, 2 ; LR_HAVE_RIGHT +%if ARCH_X86_32 + pxor m8, m8 +%else + SWAP m8, m6 +%endif + jnz .hv1_have_right + cmp wd, -10 + jl .hv1_have_right + call .extend_right +.hv1_have_right: + punpcklbw m4, m5, m8 + punpckhbw m5, m8 + palignr m7, m5, m4, 2 + palignr m3, m5, m4, 4 + paddw m2, m7, m3 + punpcklwd m0, m7, m3 + pmaddwd m0, m0 + punpckhwd m7, m3 + pmaddwd m7, m7 + palignr m3, m5, m4, 6 + paddw m2, m3 ; h sum3 + punpcklwd m1, m3, m8 + pmaddwd m1, m1 + punpckhwd m3, m8 +%if ARCH_X86_64 + SWAP m6, m8 +%endif + pmaddwd m3, m3 + paddd m0, m1 ; h sumsq3 + palignr m5, m4, 8 + punpckhwd m1, m4, m5 + paddw m8, m4, m5 + pmaddwd m1, m1 + punpcklwd m4, m5 + pmaddwd m4, m4 + paddd m7, m3 + paddw m5, m2, [t2+wq*2+400* 6] + mova [t2+wq*2+400* 6], m2 + paddw m8, m2 ; h sum5 + paddd m2, m0, [t2+wq*2+400* 8] + paddd m3, m7, [t2+wq*2+400*10] + mova [t2+wq*2+400* 8], m0 + mova [t2+wq*2+400*10], m7 + paddd m4, m0 ; h sumsq5 + paddd m1, m7 + pslld m0, m2, 3 + pslld m7, m3, 3 + paddd m2, m0 ; a3 * 9 + paddd m3, m7 +%if ARCH_X86_32 + mova [esp+20], m8 + pxor m8, m8 +%else + SWAP m8, m6 +%endif + punpcklwd m0, m5, m8 ; b3 + pmaddwd m7, m0, m0 + punpckhwd m5, m8 + pmaddwd m8, m5, m5 + psubd m2, m7 ; p3 + psubd m3, m8 + MULLD m2, m14, m8 ; p3 * s1 + MULLD m3, m14, m8 + pmaddwd m0, m11 ; b3 * 455 + pmaddwd m5, m11 + paddusw m2, m11 + paddusw m3, m11 + psrld m2, 20 ; min(z3, 255) + movif32 t3, t3m + psrld m3, 20 + GATHER_X_BY_X m8, m2, m3, r0, dstm + punpcklwd m2, m8, m8 + punpckhwd m3, m8, m8 + MULLD m0, m2, m7 + MULLD m5, m3, m7 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m5, m10 + psrld m0, 12 + psrld m5, 12 + mova 
[t4+wq*2+400*4+ 4], m8 + mova [t3+wq*4+400*8+ 8], m0 + mova [t3+wq*4+400*8+24], m5 +%if ARCH_X86_32 + mova m8, [esp+20] +%else + SWAP m6, m8 + pxor m6, m6 +%endif + paddw m5, m8, [t2+wq*2+400*0] + paddd m2, m4, [t2+wq*2+400*2] + paddd m3, m1, [t2+wq*2+400*4] + paddw m5, [t1+wq*2+400*0] + paddd m2, [t1+wq*2+400*2] + paddd m3, [t1+wq*2+400*4] + mova [t2+wq*2+400*0], m8 + pslld m0, m2, 4 + mova [t2+wq*2+400*2], m4 + pslld m8, m3, 4 + mova [t2+wq*2+400*4], m1 + pslld m4, m2, 3 + paddd m2, m0 + pslld m7, m3, 3 + paddd m3, m8 + paddd m2, m4 ; a5 * 25 + paddd m3, m7 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + punpcklwd m0, m5, m7 ; b5 + pmaddwd m4, m0, m0 + punpckhwd m5, m7 + pmaddwd m1, m5, m5 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + psubd m2, m4 ; p5 + psubd m3, m1 + MULLD m2, m13, m7 ; p5 * s0 + MULLD m3, m13, m7 + pmaddwd m0, m12 ; b5 * 164 + pmaddwd m5, m12 + paddusw m2, m12 + paddusw m3, m12 + psrld m2, 20 ; min(z5, 255) + psrld m3, 20 + GATHER_X_BY_X m1, m2, m3, r0, dstm + punpcklwd m2, m1, m1 + punpckhwd m3, m1, m1 + MULLD m0, m2, m7 + MULLD m5, m3, m7 + paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m5, m10 + mova [t4+wq*2+4], m1 + psrld m0, 12 + psrld m5, 12 + mova [t3+wq*4+ 8], m0 + mova [t3+wq*4+24], m5 + add wq, 8 + jl .hv1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.v0: ; vertical boxsums + ab3 (even rows) +%if ARCH_X86_64 + lea wq, [r4-2] +%else + mov wd, w0m +%endif +.v0_loop: + mova m0, [t1+wq*2+400* 6] + mova m4, [t1+wq*2+400* 8] + mova m5, [t1+wq*2+400*10] + paddw m0, m0 + paddd m4, m4 + paddd m5, m5 + paddw m1, m0, [t2+wq*2+400* 6] + paddd m2, m4, [t2+wq*2+400* 8] + paddd m3, m5, [t2+wq*2+400*10] + mova [t2+wq*2+400* 6], m0 + mova [t2+wq*2+400* 8], m4 + mova [t2+wq*2+400*10], m5 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; a3 * 9 + paddd m5, m3 + punpcklwd m0, m1, m7 ; b3 + pmaddwd m2, m0, m0 + punpckhwd m1, m7 + pmaddwd m3, m1, m1 + psubd m4, m2 ; p3 + psubd m5, m3 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + MULLD m4, m14, m7 ; p3 * s1 + MULLD m5, m14, m7 + pmaddwd m0, m11 ; b3 * 455 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m4, 20 ; min(z3, 255) + psrld m5, 20 + GATHER_X_BY_X m3, m4, m5, r0, dstm + punpcklwd m4, m3, m3 + punpckhwd m5, m3, m3 + MULLD m0, m4, m7 + MULLD m1, m5, m7 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + mova [t4+wq*2+400*2+4], m3 + psrld m0, 12 + psrld m1, 12 + mova m3, [t1+wq*2+400*0] + mova m4, [t1+wq*2+400*2] + mova m5, [t1+wq*2+400*4] + mova [t3+wq*4+400*8+ 8], m3 + mova [t3+wq*4+400*0+ 8], m4 + mova [t3+wq*4+400*0+24], m5 + paddw m3, m3 ; cc5 + paddd m4, m4 + paddd m5, m5 + mova [t1+wq*2+400*0], m3 + mova [t1+wq*2+400*2], m4 + mova [t1+wq*2+400*4], m5 + mova [t3+wq*4+400*4+ 8], m0 + mova [t3+wq*4+400*4+24], m1 + add wq, 8 + jl .v0_loop + ret +.v1: ; vertical boxsums + ab (odd rows) +%if ARCH_X86_64 + lea wq, [r4-2] +%else + mov wd, w0m +%endif +.v1_loop: + mova m4, [t1+wq*2+400* 6] + mova m5, [t1+wq*2+400* 8] + mova m7, [t1+wq*2+400*10] + paddw m1, m4, [t2+wq*2+400* 6] + paddd m2, m5, [t2+wq*2+400* 8] + paddd m3, m7, [t2+wq*2+400*10] + mova [t2+wq*2+400* 6], m4 + mova [t2+wq*2+400* 8], m5 + mova [t2+wq*2+400*10], m7 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + pslld m4, m2, 3 + pslld m5, m3, 3 + paddd m4, m2 ; ((a3 + 8) >> 4) * 9 + paddd m5, m3 + punpcklwd m0, m1, m7 ; b3 + pmaddwd m2, m0, m0 + punpckhwd m1, m7 + pmaddwd m3, m1, m1 + psubd m4, m2 ; p3 + psubd m5, m3 +%if 
ARCH_X86_64 + SWAP m7, m6 +%endif + MULLD m4, m14, m7 ; p3 * s1 + MULLD m5, m14, m7 + pmaddwd m0, m11 ; b3 * 455 + pmaddwd m1, m11 + paddusw m4, m11 + paddusw m5, m11 + psrld m4, 20 ; min(z3, 255) + psrld m5, 20 + GATHER_X_BY_X m3, m4, m5, r0, dstm + punpcklwd m4, m3, m3 + punpckhwd m5, m3, m3 + MULLD m0, m4, m7 + MULLD m1, m5, m7 + paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) + paddd m1, m10 + mova [t4+wq*2+400*4+4], m3 + psrld m0, 12 + psrld m8, m1, 12 + mova m4, [t3+wq*4+400*8+ 8] + mova m5, [t3+wq*4+400*0+ 8] + mova m7, [t3+wq*4+400*0+24] + paddw m1, m4, [t2+wq*2+400*0] + paddd m2, m5, [t2+wq*2+400*2] + paddd m3, m7, [t2+wq*2+400*4] + paddw m1, [t1+wq*2+400*0] + paddd m2, [t1+wq*2+400*2] + paddd m3, [t1+wq*2+400*4] + mova [t2+wq*2+400*0], m4 + mova [t2+wq*2+400*2], m5 + mova [t2+wq*2+400*4], m7 + pslld m4, m2, 4 + mova [t3+wq*4+400*8+ 8], m0 + pslld m5, m3, 4 + mova [t3+wq*4+400*8+24], m8 + pslld m7, m2, 3 + paddd m2, m4 + pslld m8, m3, 3 + paddd m3, m5 + paddd m2, m7 ; a5 * 25 + paddd m3, m8 +%if ARCH_X86_32 + pxor m7, m7 +%else + SWAP m7, m6 +%endif + punpcklwd m0, m1, m7 ; b5 + pmaddwd m4, m0, m0 + punpckhwd m1, m7 + pmaddwd m5, m1, m1 + psubd m2, m4 ; p5 + psubd m3, m5 +%if ARCH_X86_64 + SWAP m7, m6 +%endif + MULLD m2, m13, m7 ; p5 * s0 + MULLD m3, m13, m7 + pmaddwd m0, m12 ; b5 * 164 + pmaddwd m1, m12 + paddusw m2, m12 + paddusw m3, m12 + psrld m2, 20 ; min(z5, 255) + psrld m3, 20 + GATHER_X_BY_X m4, m2, m3, r0, dstm + punpcklwd m2, m4, m4 + punpckhwd m3, m4, m4 + MULLD m0, m2, m7 + MULLD m1, m3, m7 + paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) + paddd m1, m10 + mova [t4+wq*2+4], m4 + psrld m0, 12 + psrld m1, 12 + mova [t3+wq*4+ 8], m0 + mova [t3+wq*4+24], m1 + add wq, 8 + jl .v1_loop + mov r10, t2 + mov t2, t1 + mov t1, r10 + ret +.prep_n: ; initial neighbor setup + movif64 wq, r4 + movif32 wd, w1m +.prep_n_loop: + movu m0, [t4+wq*2+400*0+ 2] + movu m1, [t3+wq*4+400*0+ 4] + movu m2, [t3+wq*4+400*0+20] + movu m7, [t4+wq*2+400*0+ 4] + movu m8, [t3+wq*4+400*0+ 8] + paddw m3, m0, [t4+wq*2+400*0+ 0] + paddd m4, m1, [t3+wq*4+400*0+ 0] + paddd m5, m2, [t3+wq*4+400*0+16] + paddw m3, m7 + paddd m4, m8 + movu m7, [t3+wq*4+400*0+24] + paddw m0, m3 + paddd m1, m4 + psllw m3, 2 + pslld m4, 2 + paddd m5, m7 + paddd m2, m5 + pslld m5, 2 + paddw m0, m3 ; a5 565 + paddd m1, m4 ; b5 565 + paddd m2, m5 + mova [t4+wq*2+400* 6+ 0], m0 + mova [t3+wq*4+400*12+ 0], m1 + mova [t3+wq*4+400*12+16], m2 + movu m0, [t4+wq*2+400*2+ 4] + movu m1, [t3+wq*4+400*4+ 8] + movu m2, [t3+wq*4+400*4+24] + movu m3, [t4+wq*2+400*2+ 2] + movu m4, [t3+wq*4+400*4+ 4] + movu m5, [t3+wq*4+400*4+20] + paddw m0, [t4+wq*2+400*2+ 0] + paddd m1, [t3+wq*4+400*4+ 0] + paddd m2, [t3+wq*4+400*4+16] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + psllw m3, 2 ; a3[-1] 444 + pslld m4, 2 ; b3[-1] 444 + pslld m5, 2 + psubw m3, m0 ; a3[-1] 343 + psubd m4, m1 ; b3[-1] 343 + psubd m5, m2 + mova [t4+wq*2+400* 8+ 0], m3 + mova [t3+wq*4+400*16+ 0], m4 + mova [t3+wq*4+400*16+16], m5 + movu m0, [t4+wq*2+400*4+ 4] + movu m1, [t3+wq*4+400*8+ 8] + movu m2, [t3+wq*4+400*8+24] + movu m3, [t4+wq*2+400*4+ 2] + movu m4, [t3+wq*4+400*8+ 4] + movu m5, [t3+wq*4+400*8+20] + paddw m0, [t4+wq*2+400*4+ 0] + paddd m1, [t3+wq*4+400*8+ 0] + paddd m2, [t3+wq*4+400*8+16] + paddw m3, m0 + paddd m4, m1 + paddd m5, m2 + psllw m3, 2 ; a3[ 0] 444 + pslld m4, 2 ; b3[ 0] 444 + pslld m5, 2 + mova [t4+wq*2+400*10+ 0], m3 + mova [t3+wq*4+400*20+ 0], m4 + mova [t3+wq*4+400*20+16], m5 + psubw m3, m0 ; a3[ 0] 343 + psubd m4, m1 ; b3[ 0] 343 + psubd m5, m2 + mova 
[t4+wq*2+400*12+ 0], m3 + mova [t3+wq*4+400*24+ 0], m4 + mova [t3+wq*4+400*24+16], m5 + add wq, 8 + jl .prep_n_loop + ret +ALIGN function_align +.n0: ; neighbor + output (even rows) + movif64 wq, r4 + movif32 wd, w1m +.n0_loop: + movu m0, [t4+wq*2+ 4] + movu m2, [t4+wq*2+ 2] + paddw m0, [t4+wq*2+ 0] + paddw m0, m2 + paddw m2, m0 + psllw m0, 2 + paddw m0, m2 ; a5 + movu m4, [t3+wq*4+ 8] + movu m5, [t3+wq*4+24] + movu m1, [t3+wq*4+ 4] + movu m3, [t3+wq*4+20] + paddd m4, [t3+wq*4+ 0] + paddd m5, [t3+wq*4+16] + paddd m4, m1 + paddd m5, m3 + paddd m1, m4 + paddd m3, m5 + pslld m4, 2 + pslld m5, 2 + paddd m4, m1 ; b5 + paddd m5, m3 + movu m2, [t4+wq*2+400* 6] + paddw m2, m0 + mova [t4+wq*2+400* 6], m0 + paddd m0, m4, [t3+wq*4+400*12+ 0] + paddd m1, m5, [t3+wq*4+400*12+16] + mova [t3+wq*4+400*12+ 0], m4 + mova [t3+wq*4+400*12+16], m5 + mova [rsp+16+ARCH_X86_32*4], m1 + movu m3, [t4+wq*2+400*2+4] + movu m5, [t4+wq*2+400*2+2] + paddw m3, [t4+wq*2+400*2+0] + paddw m5, m3 + psllw m5, 2 ; a3[ 1] 444 + psubw m4, m5, m3 ; a3[ 1] 343 + movu m3, [t4+wq*2+400* 8] + paddw m3, [t4+wq*2+400*10] + paddw m3, m4 + mova [t4+wq*2+400* 8], m4 + mova [t4+wq*2+400*10], m5 + movu m1, [t3+wq*4+400*4+ 8] + movu m5, [t3+wq*4+400*4+ 4] + movu m7, [t3+wq*4+400*4+24] + movu m8, [t3+wq*4+400*4+20] + paddd m1, [t3+wq*4+400*4+ 0] + paddd m7, [t3+wq*4+400*4+16] + paddd m5, m1 + paddd m8, m7 + pslld m5, 2 ; b3[ 1] 444 + pslld m8, 2 + psubd m4, m5, m1 ; b3[ 1] 343 +%if ARCH_X86_32 + mova [esp+52], m8 + psubd m8, m7 +%else + psubd m6, m8, m7 + SWAP m8, m6 +%endif + paddd m1, m4, [t3+wq*4+400*16+ 0] + paddd m7, m8, [t3+wq*4+400*16+16] + paddd m1, [t3+wq*4+400*20+ 0] + paddd m7, [t3+wq*4+400*20+16] + mova [t3+wq*4+400*16+ 0], m4 + mova [t3+wq*4+400*16+16], m8 + mova [t3+wq*4+400*20+ 0], m5 +%if ARCH_X86_32 + mova m8, [esp+52] +%else + SWAP m8, m6 + pxor m6, m6 +%endif + mova [t3+wq*4+400*20+16], m8 + mova [rsp+32+ARCH_X86_32*4], m7 + movq m4, [dstq+wq] + punpcklbw m4, m6 + punpcklwd m5, m4, m6 + punpcklwd m7, m2, m6 + pmaddwd m7, m5 ; a5 * src + punpcklwd m8, m3, m6 + pmaddwd m8, m5 ; a3 * src + punpckhwd m5, m4, m6 + punpckhwd m2, m6 + pmaddwd m2, m5 + punpckhwd m3, m6 + pmaddwd m3, m5 + psubd m0, m7 ; b5 - a5 * src + (1 << 8) - (src << 13) + psubd m1, m8 ; b3 - a3 * src + (1 << 8) - (src << 13) + psrld m0, 9 + pslld m1, 7 + pand m0, m9 + pandn m8, m9, m1 + por m0, m8 + mova m1, [rsp+16+ARCH_X86_32*4] + psubd m1, m2 + mova m2, [rsp+32+ARCH_X86_32*4] + psubd m2, m3 + mova m3, [base+pd_4096] + psrld m1, 9 + pslld m2, 7 + pand m1, m9 + pandn m5, m9, m2 + por m1, m5 + pmaddwd m0, m15 + pmaddwd m1, m15 + paddd m0, m3 + paddd m1, m3 + psrad m0, 13 + psrad m1, 13 + packssdw m0, m1 + paddw m0, m4 + packuswb m0, m0 + movq [dstq+wq], m0 + add wq, 8 + jl .n0_loop + add dstq, stridemp + ret +ALIGN function_align +.n1: ; neighbor + output (odd rows) + movif64 wq, r4 + movif32 wd, w1m +.n1_loop: + movu m3, [t4+wq*2+400*4+4] + movu m5, [t4+wq*2+400*4+2] + paddw m3, [t4+wq*2+400*4+0] + paddw m5, m3 + psllw m5, 2 ; a3[ 1] 444 + psubw m4, m5, m3 ; a3[ 1] 343 + paddw m3, m4, [t4+wq*2+400*12] + paddw m3, [t4+wq*2+400*10] + mova [t4+wq*2+400*10], m5 + mova [t4+wq*2+400*12], m4 + movu m1, [t3+wq*4+400*8+ 8] + movu m5, [t3+wq*4+400*8+ 4] + movu m7, [t3+wq*4+400*8+24] + movu m8, [t3+wq*4+400*8+20] + paddd m1, [t3+wq*4+400*8+ 0] + paddd m7, [t3+wq*4+400*8+16] + paddd m5, m1 + paddd m8, m7 + pslld m5, 2 ; b3[ 1] 444 + pslld m8, 2 + psubd m4, m5, m1 ; b3[ 1] 343 + psubd m0, m8, m7 + paddd m1, m4, [t3+wq*4+400*24+ 0] + paddd m7, m0, [t3+wq*4+400*24+16] + 
paddd m1, [t3+wq*4+400*20+ 0] + paddd m7, [t3+wq*4+400*20+16] + mova [t3+wq*4+400*20+ 0], m5 + mova [t3+wq*4+400*20+16], m8 + mova [t3+wq*4+400*24+ 0], m4 + mova [t3+wq*4+400*24+16], m0 + movq m5, [dstq+wq] + mova m2, [t4+wq*2+400* 6] + punpcklbw m5, m6 + punpcklwd m4, m5, m6 + punpcklwd m8, m2, m6 + pmaddwd m8, m4 ; a5 * src + punpcklwd m0, m3, m6 + pmaddwd m0, m4 ; a3 * src + punpckhwd m4, m5, m6 + punpckhwd m2, m6 + pmaddwd m2, m4 + punpckhwd m3, m6 + pmaddwd m3, m4 + psubd m1, m0 ; b3 - a3 * src + (1 << 8) - (src << 13) + mova m0, [t3+wq*4+400*12+ 0] + psubd m0, m8 ; b5 - a5 * src + (1 << 8) - (src << 13) + mova m4, [t3+wq*4+400*12+16] + psubd m4, m2 + psubd m7, m3 + pslld m1, 7 + psrld m0, 8 + psrld m4, 8 + pslld m7, 7 + pandn m3, m9, m1 + pand m0, m9 + por m0, m3 + pand m4, m9 + pandn m2, m9, m7 + por m2, m4 + mova m1, [base+pd_4096] + pmaddwd m0, m15 + pmaddwd m2, m15 + paddd m0, m1 + paddd m2, m1 + psrad m0, 13 + psrad m2, 13 + packssdw m0, m2 + paddw m0, m5 + packuswb m0, m0 + movq [dstq+wq], m0 + add wq, 8 + jl .n1_loop + add dstq, stridemp + movif32 dstm, dstq + ret diff --git a/third_party/dav1d/src/x86/mc.h b/third_party/dav1d/src/x86/mc.h new file mode 100644 index 0000000000..65c607e180 --- /dev/null +++ b/third_party/dav1d/src/x86/mc.h @@ -0,0 +1,299 @@ +/* + * Copyright © 2018-2021, VideoLAN and dav1d authors + * Copyright © 2018-2021, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "src/cpu.h" +#include "src/mc.h" + +#define decl_fn(type, name) \ + decl_##type##_fn(BF(name, sse2)); \ + decl_##type##_fn(BF(name, ssse3)); \ + decl_##type##_fn(BF(name, avx2)); \ + decl_##type##_fn(BF(name, avx512icl)); +#define init_mc_fn(type, name, suffix) \ + c->mc[type] = BF(dav1d_put_##name, suffix) +#define init_mct_fn(type, name, suffix) \ + c->mct[type] = BF(dav1d_prep_##name, suffix) +#define init_mc_scaled_fn(type, name, suffix) \ + c->mc_scaled[type] = BF(dav1d_put_##name, suffix) +#define init_mct_scaled_fn(type, name, suffix) \ + c->mct_scaled[type] = BF(dav1d_prep_##name, suffix) + +decl_fn(mc, dav1d_put_8tap_regular); +decl_fn(mc, dav1d_put_8tap_regular_smooth); +decl_fn(mc, dav1d_put_8tap_regular_sharp); +decl_fn(mc, dav1d_put_8tap_smooth); +decl_fn(mc, dav1d_put_8tap_smooth_regular); +decl_fn(mc, dav1d_put_8tap_smooth_sharp); +decl_fn(mc, dav1d_put_8tap_sharp); +decl_fn(mc, dav1d_put_8tap_sharp_regular); +decl_fn(mc, dav1d_put_8tap_sharp_smooth); +decl_fn(mc, dav1d_put_bilin); + +decl_fn(mct, dav1d_prep_8tap_regular); +decl_fn(mct, dav1d_prep_8tap_regular_smooth); +decl_fn(mct, dav1d_prep_8tap_regular_sharp); +decl_fn(mct, dav1d_prep_8tap_smooth); +decl_fn(mct, dav1d_prep_8tap_smooth_regular); +decl_fn(mct, dav1d_prep_8tap_smooth_sharp); +decl_fn(mct, dav1d_prep_8tap_sharp); +decl_fn(mct, dav1d_prep_8tap_sharp_regular); +decl_fn(mct, dav1d_prep_8tap_sharp_smooth); +decl_fn(mct, dav1d_prep_bilin); + +decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular_smooth); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular_sharp); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth_regular); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth_sharp); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp_regular); +decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp_smooth); +decl_fn(mc_scaled, dav1d_put_bilin_scaled); + +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular_smooth); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular_sharp); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth_regular); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth_sharp); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp_regular); +decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp_smooth); +decl_fn(mct_scaled, dav1d_prep_bilin_scaled); + +decl_fn(avg, dav1d_avg); +decl_fn(w_avg, dav1d_w_avg); +decl_fn(mask, dav1d_mask); +decl_fn(w_mask, dav1d_w_mask_420); +decl_fn(w_mask, dav1d_w_mask_422); +decl_fn(w_mask, dav1d_w_mask_444); +decl_fn(blend, dav1d_blend); +decl_fn(blend_dir, dav1d_blend_v); +decl_fn(blend_dir, dav1d_blend_h); + +decl_fn(warp8x8, dav1d_warp_affine_8x8); +decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, sse4)); +decl_fn(warp8x8t, dav1d_warp_affine_8x8t); +decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, sse4)); + +decl_fn(emu_edge, dav1d_emu_edge); + +decl_fn(resize, dav1d_resize); + +static ALWAYS_INLINE void mc_dsp_init_x86(Dav1dMCDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if(!(flags & DAV1D_X86_CPU_FLAG_SSE2)) + return; + +#if BITDEPTH == 8 + init_mct_fn(FILTER_2D_BILINEAR, bilin, sse2); + init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, sse2); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, sse2); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 
8tap_regular_sharp, sse2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, sse2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, sse2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, sse2); + init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, sse2); + init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, sse2); + init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, sse2); + + c->warp8x8 = BF(dav1d_warp_affine_8x8, sse2); + c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse2); +#endif + + if(!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) + return; + + init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3); + init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3); + init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3); + init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3); + init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3); + init_mc_fn(FILTER_2D_BILINEAR, bilin, ssse3); + + init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3); + init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3); + init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3); + init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3); + init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3); + init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3); + + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3); + init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3); + + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3); + init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3); + + c->avg = BF(dav1d_avg, ssse3); + c->w_avg = BF(dav1d_w_avg, ssse3); + c->mask = 
BF(dav1d_mask, ssse3); + c->w_mask[0] = BF(dav1d_w_mask_444, ssse3); + c->w_mask[1] = BF(dav1d_w_mask_422, ssse3); + c->w_mask[2] = BF(dav1d_w_mask_420, ssse3); + c->blend = BF(dav1d_blend, ssse3); + c->blend_v = BF(dav1d_blend_v, ssse3); + c->blend_h = BF(dav1d_blend_h, ssse3); + c->warp8x8 = BF(dav1d_warp_affine_8x8, ssse3); + c->warp8x8t = BF(dav1d_warp_affine_8x8t, ssse3); + c->emu_edge = BF(dav1d_emu_edge, ssse3); + c->resize = BF(dav1d_resize, ssse3); + + if(!(flags & DAV1D_X86_CPU_FLAG_SSE41)) + return; + +#if BITDEPTH == 8 + c->warp8x8 = BF(dav1d_warp_affine_8x8, sse4); + c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse4); +#endif + +#if ARCH_X86_64 + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) + return; + + init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2); + init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2); + init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2); + init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2); + init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2); + init_mc_fn(FILTER_2D_BILINEAR, bilin, avx2); + + init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2); + init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2); + init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2); + init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2); + init_mct_fn(FILTER_2D_BILINEAR, bilin, avx2); + + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2); + init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2); + + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2); + init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, 
avx2); + + c->avg = BF(dav1d_avg, avx2); + c->w_avg = BF(dav1d_w_avg, avx2); + c->mask = BF(dav1d_mask, avx2); + c->w_mask[0] = BF(dav1d_w_mask_444, avx2); + c->w_mask[1] = BF(dav1d_w_mask_422, avx2); + c->w_mask[2] = BF(dav1d_w_mask_420, avx2); + c->blend = BF(dav1d_blend, avx2); + c->blend_v = BF(dav1d_blend_v, avx2); + c->blend_h = BF(dav1d_blend_h, avx2); + c->warp8x8 = BF(dav1d_warp_affine_8x8, avx2); + c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx2); + c->emu_edge = BF(dav1d_emu_edge, avx2); + c->resize = BF(dav1d_resize, avx2); + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) + return; + + init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, avx512icl); + init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl); + init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx512icl); + init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl); + init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx512icl); + init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx512icl); + init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx512icl); + init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx512icl); + init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl); + init_mc_fn (FILTER_2D_BILINEAR, bilin, avx512icl); + + init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx512icl); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx512icl); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl); + init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx512icl); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx512icl); + init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx512icl); + init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx512icl); + init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl); + init_mct_fn(FILTER_2D_BILINEAR, bilin, avx512icl); + + c->avg = BF(dav1d_avg, avx512icl); + c->w_avg = BF(dav1d_w_avg, avx512icl); + c->mask = BF(dav1d_mask, avx512icl); + c->w_mask[0] = BF(dav1d_w_mask_444, avx512icl); + c->w_mask[1] = BF(dav1d_w_mask_422, avx512icl); + c->w_mask[2] = BF(dav1d_w_mask_420, avx512icl); + c->blend = BF(dav1d_blend, avx512icl); + c->blend_v = BF(dav1d_blend_v, avx512icl); + c->blend_h = BF(dav1d_blend_h, avx512icl); + c->warp8x8 = BF(dav1d_warp_affine_8x8, avx512icl); + c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx512icl); + c->resize = BF(dav1d_resize, avx512icl); +#endif +} diff --git a/third_party/dav1d/src/x86/mc16_avx2.asm b/third_party/dav1d/src/x86/mc16_avx2.asm new file mode 100644 index 0000000000..61eeaa1007 --- /dev/null +++ b/third_party/dav1d/src/x86/mc16_avx2.asm @@ -0,0 +1,5879 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +; dav1d_obmc_masks[] * -512 +const obmc_masks_avx2 + dw 0, 0, -9728, 0, -12800, -7168, -2560, 0 + dw -14336, -11264, -8192, -5632, -3584, -1536, 0, 0 + dw -15360, -13824, -12288, -10752, -9216, -7680, -6144, -5120 + dw -4096, -3072, -2048, -1536, 0, 0, 0, 0 + dw -15872, -14848, -14336, -13312, -12288, -11776, -10752, -10240 + dw -9728, -8704, -8192, -7168, -6656, -6144, -5632, -4608 + dw -4096, -3584, -3072, -2560, -2048, -2048, -1536, -1024 + dw 0, 0, 0, 0, 0, 0, 0, 0 + +deint_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7 +subpel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 +subpel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 +subpel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 +subpel_s_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 +subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7 +rescale_mul2: dd 0, 1, 4, 5, 2, 3, 6, 7 +resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 + db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15 +blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 +wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +bdct_lb_q: times 8 db 0 + times 8 db 4 + times 8 db 8 + times 8 db 12 + +prep_mul: dw 16, 16, 4, 4 +put_bilin_h_rnd: dw 8, 8, 10, 10 +put_8tap_h_rnd: dd 34, 40 +s_8tap_h_rnd: dd 2, 8 +s_8tap_h_sh: dd 2, 4 +put_s_8tap_v_rnd: dd 512, 128 +put_s_8tap_v_sh: dd 10, 8 +prep_8tap_1d_rnd: dd 8 - (8192 << 4) +prep_8tap_2d_rnd: dd 32 - (8192 << 5) +warp8x8t_rnd: dd 16384 - (8192 << 15) +warp8x8_shift: dd 5, 3 +warp8x8_rnd: dw 4096, 4096, 16384, 16384 +bidir_rnd: dw -16400, -16400, -16388, -16388 +bidir_mul: dw 2048, 2048, 8192, 8192 + +%define pw_16 prep_mul +%define pd_512 put_s_8tap_v_rnd + +pw_2: times 2 dw 2 +pw_64: times 2 dw 64 +pw_2048: times 2 dw 2048 +pw_8192: times 2 dw 8192 +pw_27615: times 2 dw 27615 +pw_32766: times 2 dw 32766 +pw_m512: times 2 dw -512 +pd_32: dd 32 +pd_63: dd 63 +pd_64: dd 64 +pd_32768: dd 32768 +pd_65538: dd 65538 +pd_m524256: dd -524256 ; -8192 << 6 + 32 +pd_0x3ff: dd 0x3ff +pq_0x40000000: dq 0x40000000 + dd 0 + +%macro BIDIR_JMP_TABLE 2-* + %xdefine %1_%2_table (%%table - 2*%3) + %xdefine %%base %1_%2_table + %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2) + %%table: + %rep %0 - 2 + dd %%prefix %+ .w%3 - %%base + %rotate 1 + %endrep +%endmacro + +BIDIR_JMP_TABLE avg, avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64, 128 
+BIDIR_JMP_TABLE w_mask_444, avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE blend, avx2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_v, avx2, 2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_h, avx2, 2, 4, 8, 16, 32, 64, 128 + +%macro BASE_JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - %3) + %xdefine %%base %1_%2 + %%table: + %rep %0 - 2 + dw %%base %+ _w%3 - %%base + %rotate 1 + %endrep +%endmacro + +%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_16bpc_avx2.put) +%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_16bpc_avx2.prep) + +BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128 +BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128 + +%macro HV_JMP_TABLE 5-* + %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3) + %xdefine %%base %1_%3 + %assign %%types %4 + %if %%types & 1 + %xdefine %1_%2_h_%3_table (%%h - %5) + %%h: + %rep %0 - 4 + dw %%prefix %+ .h_w%5 - %%base + %rotate 1 + %endrep + %rotate 4 + %endif + %if %%types & 2 + %xdefine %1_%2_v_%3_table (%%v - %5) + %%v: + %rep %0 - 4 + dw %%prefix %+ .v_w%5 - %%base + %rotate 1 + %endrep + %rotate 4 + %endif + %if %%types & 4 + %xdefine %1_%2_hv_%3_table (%%hv - %5) + %%hv: + %rep %0 - 4 + dw %%prefix %+ .hv_w%5 - %%base + %rotate 1 + %endrep + %endif +%endmacro + +HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128 + +%macro SCALED_JMP_TABLE 2-* + %xdefine %1_%2_table (%%table - %3) + %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2) +%%table: + %rep %0 - 2 + dw %%base %+ .w%3 - %%base + %rotate 1 + %endrep + %rotate 2 + %%dy_1024: + %xdefine %1_%2_dy1_table (%%dy_1024 - %3) + %rep %0 - 2 + dw %%base %+ .dy1_w%3 - %%base + %rotate 1 + %endrep + %rotate 2 + %%dy_2048: + %xdefine %1_%2_dy2_table (%%dy_2048 - %3) + %rep %0 - 2 + dw %%base %+ .dy2_w%3 - %%base + %rotate 1 + %endrep +%endmacro + +SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128 +SCALED_JMP_TABLE prep_8tap_scaled, avx2, 4, 8, 16, 32, 64, 128 + +%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX + +cextern mc_subpel_filters +%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) + +cextern mc_warp_filter +cextern resize_filter + +SECTION .text + +INIT_XMM avx2 +cglobal put_bilin_16bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy + mov mxyd, r6m ; mx + lea r7, [put_avx2] +%if UNIX64 + DECLARE_REG_TMP 8 + %define org_w r8d + mov r8d, wd +%else + DECLARE_REG_TMP 7 + %define org_w wm +%endif + tzcnt wd, wm + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r7m ; my + test mxyd, mxyd + jnz .v +.put: + movzx wd, word [r7+wq*2+table_offset(put,)] + add wq, r7 + jmp wq +.put_w2: + mov r6d, [srcq+ssq*0] + mov r7d, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r6d + mov [dstq+dsq*1], r7d + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w2 + RET +.put_w4: + mov r6, [srcq+ssq*0] + mov r7, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r6 + mov [dstq+dsq*1], r7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w4 + RET +.put_w8: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w8 + RET +INIT_YMM avx2 +.put_w16: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w16 + RET +.put_w32: + movu m0, [srcq+ssq*0+32*0] + movu m1, [srcq+ssq*0+32*1] + movu m2, [srcq+ssq*1+32*0] + movu m3, [srcq+ssq*1+32*1] + lea srcq, 
[srcq+ssq*2] + mova [dstq+dsq*0+32*0], m0 + mova [dstq+dsq*0+32*1], m1 + mova [dstq+dsq*1+32*0], m2 + mova [dstq+dsq*1+32*1], m3 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w32 + RET +.put_w64: + movu m0, [srcq+32*0] + movu m1, [srcq+32*1] + movu m2, [srcq+32*2] + movu m3, [srcq+32*3] + add srcq, ssq + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + mova [dstq+32*2], m2 + mova [dstq+32*3], m3 + add dstq, dsq + dec hd + jg .put_w64 + RET +.put_w128: + movu m0, [srcq+32*0] + movu m1, [srcq+32*1] + movu m2, [srcq+32*2] + movu m3, [srcq+32*3] + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + mova [dstq+32*2], m2 + mova [dstq+32*3], m3 + movu m0, [srcq+32*4] + movu m1, [srcq+32*5] + movu m2, [srcq+32*6] + movu m3, [srcq+32*7] + add srcq, ssq + mova [dstq+32*4], m0 + mova [dstq+32*5], m1 + mova [dstq+32*6], m2 + mova [dstq+32*7], m3 + add dstq, dsq + dec hd + jg .put_w128 + RET +.h: + movd xm5, mxyd + mov mxyd, r7m ; my + vpbroadcastd m4, [pw_16] + vpbroadcastw m5, xm5 + psubw m4, m5 + test mxyd, mxyd + jnz .hv + ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v + movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)] + mov r6d, r8m ; bitdepth_max + add wq, r7 + shr r6d, 11 + vpbroadcastd m3, [r7-put_avx2+put_bilin_h_rnd+r6*4] + jmp wq +.h_w2: + movq xm1, [srcq+ssq*0] + movhps xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmullw xm0, xm4, xm1 + psrlq xm1, 16 + pmullw xm1, xm5 + paddw xm0, xm3 + paddw xm0, xm1 + psrlw xm0, 4 + movd [dstq+dsq*0], xm0 + pextrd [dstq+dsq*1], xm0, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2 + RET +.h_w4: + movq xm0, [srcq+ssq*0] + movhps xm0, [srcq+ssq*1] + movq xm1, [srcq+ssq*0+2] + movhps xm1, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + pmullw xm0, xm4 + pmullw xm1, xm5 + paddw xm0, xm3 + paddw xm0, xm1 + psrlw xm0, 4 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4 + RET +.h_w8: + movu xm0, [srcq+ssq*0] + vinserti128 m0, [srcq+ssq*1], 1 + movu xm1, [srcq+ssq*0+2] + vinserti128 m1, [srcq+ssq*1+2], 1 + lea srcq, [srcq+ssq*2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 4 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + pmullw m0, m4, [srcq+ssq*0] + pmullw m1, m5, [srcq+ssq*0+2] + paddw m0, m3 + paddw m0, m1 + pmullw m1, m4, [srcq+ssq*1] + pmullw m2, m5, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + paddw m1, m3 + paddw m1, m2 + psrlw m0, 4 + psrlw m1, 4 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w16 + RET +.h_w32: + pmullw m0, m4, [srcq+32*0] + pmullw m1, m5, [srcq+32*0+2] + paddw m0, m3 + paddw m0, m1 + pmullw m1, m4, [srcq+32*1] + pmullw m2, m5, [srcq+32*1+2] + add srcq, ssq + paddw m1, m3 + paddw m1, m2 + psrlw m0, 4 + psrlw m1, 4 + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + add dstq, dsq + dec hd + jg .h_w32 + RET +.h_w64: +.h_w128: + movifnidn t0d, org_w +.h_w64_loop0: + mov r6d, t0d +.h_w64_loop: + pmullw m0, m4, [srcq+r6*2-32*1] + pmullw m1, m5, [srcq+r6*2-32*1+2] + paddw m0, m3 + paddw m0, m1 + pmullw m1, m4, [srcq+r6*2-32*2] + pmullw m2, m5, [srcq+r6*2-32*2+2] + paddw m1, m3 + paddw m1, m2 + psrlw m0, 4 + psrlw m1, 4 + mova [dstq+r6*2-32*1], m0 + mova [dstq+r6*2-32*2], m1 + sub r6d, 32 + jg .h_w64_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w64_loop0 + RET +.v: + movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)] + shl mxyd, 11 + movd xm5, mxyd + add wq, r7 + vpbroadcastw m5, xm5 + jmp wq +.v_w2: + movd xm0, 
[srcq+ssq*0] +.v_w2_loop: + movd xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpckldq xm2, xm0, xm1 + movd xm0, [srcq+ssq*0] + punpckldq xm1, xm0 + psubw xm1, xm2 + pmulhrsw xm1, xm5 + paddw xm1, xm2 + movd [dstq+dsq*0], xm1 + pextrd [dstq+dsq*1], xm1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movq xm0, [srcq+ssq*0] +.v_w4_loop: + movq xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklqdq xm2, xm0, xm1 + movq xm0, [srcq+ssq*0] + punpcklqdq xm1, xm0 + psubw xm1, xm2 + pmulhrsw xm1, xm5 + paddw xm1, xm2 + movq [dstq+dsq*0], xm1 + movhps [dstq+dsq*1], xm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + movu xm0, [srcq+ssq*0] +.v_w8_loop: + vbroadcasti128 m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd m2, m0, m1, 0xf0 + vbroadcasti128 m0, [srcq+ssq*0] + vpblendd m1, m0, 0xf0 + psubw m1, m2 + pmulhrsw m1, m5 + paddw m1, m2 + mova [dstq+dsq*0], xm1 + vextracti128 [dstq+dsq*1], m1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + RET +.v_w32: + movu m0, [srcq+ssq*0+32*0] + movu m1, [srcq+ssq*0+32*1] +.v_w32_loop: + movu m2, [srcq+ssq*1+32*0] + movu m3, [srcq+ssq*1+32*1] + lea srcq, [srcq+ssq*2] + psubw m4, m2, m0 + pmulhrsw m4, m5 + paddw m4, m0 + movu m0, [srcq+ssq*0+32*0] + mova [dstq+dsq*0+32*0], m4 + psubw m4, m3, m1 + pmulhrsw m4, m5 + paddw m4, m1 + movu m1, [srcq+ssq*0+32*1] + mova [dstq+dsq*0+32*1], m4 + psubw m4, m0, m2 + pmulhrsw m4, m5 + paddw m4, m2 + mova [dstq+dsq*1+32*0], m4 + psubw m4, m1, m3 + pmulhrsw m4, m5 + paddw m4, m3 + mova [dstq+dsq*1+32*1], m4 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w32_loop + RET +.v_w16: +.v_w64: +.v_w128: + movifnidn t0d, org_w + add t0d, t0d + mov r4, srcq + lea r6d, [hq+t0*8-256] + mov r7, dstq +.v_w16_loop0: + movu m0, [srcq+ssq*0] +.v_w16_loop: + movu m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + psubw m1, m3, m0 + pmulhrsw m1, m5 + paddw m1, m0 + movu m0, [srcq+ssq*0] + psubw m2, m0, m3 + pmulhrsw m2, m5 + paddw m2, m3 + mova [dstq+dsq*0], m1 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w16_loop + add r4, 32 + add r7, 32 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<8 + jg .v_w16_loop0 + RET +.hv: + movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)] + WIN64_SPILL_XMM 8 + shl mxyd, 11 + vpbroadcastd m3, [pw_2] + movd xm6, mxyd + vpbroadcastd m7, [pw_8192] + add wq, r7 + vpbroadcastw m6, xm6 + test dword r8m, 0x800 + jnz .hv_12bpc + psllw m4, 2 + psllw m5, 2 + vpbroadcastd m7, [pw_2048] +.hv_12bpc: + jmp wq +.hv_w2: + vpbroadcastq xm1, [srcq+ssq*0] + pmullw xm0, xm4, xm1 + psrlq xm1, 16 + pmullw xm1, xm5 + paddw xm0, xm3 + paddw xm0, xm1 + psrlw xm0, 2 +.hv_w2_loop: + movq xm2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps xm2, [srcq+ssq*0] + pmullw xm1, xm4, xm2 + psrlq xm2, 16 + pmullw xm2, xm5 + paddw xm1, xm3 + paddw xm1, xm2 + psrlw xm1, 2 ; 1 _ 2 _ + shufpd xm2, xm0, xm1, 0x01 ; 0 _ 1 _ + mova xm0, xm1 + psubw xm1, xm2 + paddw xm1, xm1 + pmulhw xm1, xm6 + paddw xm1, xm2 + pmulhrsw xm1, xm7 + movd [dstq+dsq*0], xm1 + pextrd [dstq+dsq*1], xm1, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + pmullw xm0, xm4, [srcq+ssq*0-8] + pmullw xm1, xm5, [srcq+ssq*0-6] + paddw xm0, xm3 + paddw xm0, xm1 + psrlw xm0, 2 +.hv_w4_loop: + movq xm1, [srcq+ssq*1] + movq xm2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + movhps xm1, [srcq+ssq*0] + movhps xm2, [srcq+ssq*0+2] + pmullw xm1, xm4 + pmullw xm2, xm5 + paddw xm1, xm3 + paddw xm1, xm2 + psrlw xm1, 2 ; 1 2 + shufpd xm2, xm0, xm1, 0x01 ; 0 1 + mova xm0, xm1 + psubw xm1, xm2 + 
paddw xm1, xm1 + pmulhw xm1, xm6 + paddw xm1, xm2 + pmulhrsw xm1, xm7 + movq [dstq+dsq*0], xm1 + movhps [dstq+dsq*1], xm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + pmullw xm0, xm4, [srcq+ssq*0] + pmullw xm1, xm5, [srcq+ssq*0+2] + paddw xm0, xm3 + paddw xm0, xm1 + psrlw xm0, 2 + vinserti128 m0, xm0, 1 +.hv_w8_loop: + movu xm1, [srcq+ssq*1] + movu xm2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + vinserti128 m1, [srcq+ssq*0], 1 + vinserti128 m2, [srcq+ssq*0+2], 1 + pmullw m1, m4 + pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m1, 2 ; 1 2 + vperm2i128 m2, m0, m1, 0x21 ; 0 1 + mova m0, m1 + psubw m1, m2 + paddw m1, m1 + pmulhw m1, m6 + paddw m1, m2 + pmulhrsw m1, m7 + mova [dstq+dsq*0], xm1 + vextracti128 [dstq+dsq*1], m1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w16: +.hv_w32: +.hv_w64: +.hv_w128: +%if UNIX64 + lea r6d, [r8*2-32] +%else + mov r6d, wm + lea r6d, [r6*2-32] +%endif + mov r4, srcq + lea r6d, [hq+r6*8] + mov r7, dstq +.hv_w16_loop0: + pmullw m0, m4, [srcq+ssq*0] + pmullw m1, m5, [srcq+ssq*0+2] + paddw m0, m3 + paddw m0, m1 + psrlw m0, 2 +.hv_w16_loop: + pmullw m1, m4, [srcq+ssq*1] + pmullw m2, m5, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + paddw m1, m3 + paddw m1, m2 + psrlw m1, 2 + psubw m2, m1, m0 + paddw m2, m2 + pmulhw m2, m6 + paddw m2, m0 + pmulhrsw m2, m7 + mova [dstq+dsq*0], m2 + pmullw m0, m4, [srcq+ssq*0] + pmullw m2, m5, [srcq+ssq*0+2] + paddw m0, m3 + paddw m0, m2 + psrlw m0, 2 + psubw m2, m0, m1 + paddw m2, m2 + pmulhw m2, m6 + paddw m2, m1 + pmulhrsw m2, m7 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w16_loop + add r4, 32 + add r7, 32 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<8 + jg .hv_w16_loop0 + RET + +cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 + movifnidn mxyd, r5m ; mx + lea r6, [prep_avx2] +%if UNIX64 + DECLARE_REG_TMP 7 + %define org_w r7d +%else + DECLARE_REG_TMP 6 + %define org_w r5m +%endif + mov org_w, wd + tzcnt wd, wm + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r6m ; my + test mxyd, mxyd + jnz .v +.prep: + movzx wd, word [r6+wq*2+table_offset(prep,)] + mov r5d, r7m ; bitdepth_max + vpbroadcastd m5, [r6-prep_avx2+pw_8192] + add wq, r6 + shr r5d, 11 + vpbroadcastd m4, [r6-prep_avx2+prep_mul+r5*4] + lea stride3q, [strideq*3] + jmp wq +.prep_w4: + movq xm0, [srcq+strideq*0] + movhps xm0, [srcq+strideq*1] + vpbroadcastq m1, [srcq+strideq*2] + vpbroadcastq m2, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpblendd m0, m1, 0x30 + vpblendd m0, m2, 0xc0 + pmullw m0, m4 + psubw m0, m5 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 4 + jg .prep_w4 + RET +.prep_w8: + movu xm0, [srcq+strideq*0] + vinserti128 m0, [srcq+strideq*1], 1 + movu xm1, [srcq+strideq*2] + vinserti128 m1, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + pmullw m0, m4 + pmullw m1, m4 + psubw m0, m5 + psubw m1, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + add tmpq, 32*2 + sub hd, 4 + jg .prep_w8 + RET +.prep_w16: + pmullw m0, m4, [srcq+strideq*0] + pmullw m1, m4, [srcq+strideq*1] + pmullw m2, m4, [srcq+strideq*2] + pmullw m3, m4, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + psubw m0, m5 + psubw m1, m5 + psubw m2, m5 + psubw m3, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + add tmpq, 32*4 + sub hd, 4 + jg .prep_w16 + RET +.prep_w32: + pmullw m0, m4, [srcq+strideq*0+32*0] + pmullw m1, m4, [srcq+strideq*0+32*1] + pmullw m2, m4, [srcq+strideq*1+32*0] + pmullw m3, m4, [srcq+strideq*1+32*1] + lea srcq, 
[srcq+strideq*2] + psubw m0, m5 + psubw m1, m5 + psubw m2, m5 + psubw m3, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + add tmpq, 32*4 + sub hd, 2 + jg .prep_w32 + RET +.prep_w64: + pmullw m0, m4, [srcq+32*0] + pmullw m1, m4, [srcq+32*1] + pmullw m2, m4, [srcq+32*2] + pmullw m3, m4, [srcq+32*3] + add srcq, strideq + psubw m0, m5 + psubw m1, m5 + psubw m2, m5 + psubw m3, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + add tmpq, 32*4 + dec hd + jg .prep_w64 + RET +.prep_w128: + pmullw m0, m4, [srcq+32*0] + pmullw m1, m4, [srcq+32*1] + pmullw m2, m4, [srcq+32*2] + pmullw m3, m4, [srcq+32*3] + psubw m0, m5 + psubw m1, m5 + psubw m2, m5 + psubw m3, m5 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + mova [tmpq+32*2], m2 + mova [tmpq+32*3], m3 + pmullw m0, m4, [srcq+32*4] + pmullw m1, m4, [srcq+32*5] + pmullw m2, m4, [srcq+32*6] + pmullw m3, m4, [srcq+32*7] + add tmpq, 32*8 + add srcq, strideq + psubw m0, m5 + psubw m1, m5 + psubw m2, m5 + psubw m3, m5 + mova [tmpq-32*4], m0 + mova [tmpq-32*3], m1 + mova [tmpq-32*2], m2 + mova [tmpq-32*1], m3 + dec hd + jg .prep_w128 + RET +.h: + movd xm5, mxyd + mov mxyd, r6m ; my + vpbroadcastd m4, [pw_16] + vpbroadcastw m5, xm5 + vpbroadcastd m3, [pw_32766] + psubw m4, m5 + test dword r7m, 0x800 + jnz .h_12bpc + psllw m4, 2 + psllw m5, 2 +.h_12bpc: + test mxyd, mxyd + jnz .hv + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +.h_w4: + movu xm1, [srcq+strideq*0] + vinserti128 m1, [srcq+strideq*2], 1 + movu xm2, [srcq+strideq*1] + vinserti128 m2, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + punpcklqdq m0, m1, m2 + psrldq m1, 2 + pslldq m2, 6 + pmullw m0, m4 + vpblendd m1, m2, 0xcc + pmullw m1, m5 + psubw m0, m3 + paddw m0, m1 + psraw m0, 2 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 4 + jg .h_w4 + RET +.h_w8: + movu xm0, [srcq+strideq*0] + vinserti128 m0, [srcq+strideq*1], 1 + movu xm1, [srcq+strideq*0+2] + vinserti128 m1, [srcq+strideq*1+2], 1 + lea srcq, [srcq+strideq*2] + pmullw m0, m4 + pmullw m1, m5 + psubw m0, m3 + paddw m0, m1 + psraw m0, 2 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 2 + jg .h_w8 + RET +.h_w16: + pmullw m0, m4, [srcq+strideq*0] + pmullw m1, m5, [srcq+strideq*0+2] + psubw m0, m3 + paddw m0, m1 + pmullw m1, m4, [srcq+strideq*1] + pmullw m2, m5, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + psubw m1, m3 + paddw m1, m2 + psraw m0, 2 + psraw m1, 2 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + add tmpq, 32*2 + sub hd, 2 + jg .h_w16 + RET +.h_w32: +.h_w64: +.h_w128: + movifnidn t0d, org_w +.h_w32_loop0: + mov r3d, t0d +.h_w32_loop: + pmullw m0, m4, [srcq+r3*2-32*1] + pmullw m1, m5, [srcq+r3*2-32*1+2] + psubw m0, m3 + paddw m0, m1 + pmullw m1, m4, [srcq+r3*2-32*2] + pmullw m2, m5, [srcq+r3*2-32*2+2] + psubw m1, m3 + paddw m1, m2 + psraw m0, 2 + psraw m1, 2 + mova [tmpq+r3*2-32*1], m0 + mova [tmpq+r3*2-32*2], m1 + sub r3d, 32 + jg .h_w32_loop + add srcq, strideq + lea tmpq, [tmpq+t0*2] + dec hd + jg .h_w32_loop0 + RET +.v: + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] + movd xm5, mxyd + vpbroadcastd m4, [pw_16] + vpbroadcastw m5, xm5 + vpbroadcastd m3, [pw_32766] + add wq, r6 + lea stride3q, [strideq*3] + psubw m4, m5 + test dword r7m, 0x800 + jnz .v_12bpc + psllw m4, 2 + psllw m5, 2 +.v_12bpc: + jmp wq +.v_w4: + movq xm0, [srcq+strideq*0] +.v_w4_loop: + vpbroadcastq m2, [srcq+strideq*2] + vpbroadcastq xm1, [srcq+strideq*1] + vpblendd m2, m0, 0x03 ; 0 2 2 2 + vpbroadcastq 
m0, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpblendd m1, m0, 0xf0 ; 1 1 3 3 + vpbroadcastq m0, [srcq+strideq*0] + vpblendd m1, m2, 0x33 ; 0 1 2 3 + vpblendd m0, m2, 0x0c ; 4 2 4 4 + punpckhqdq m2, m1, m0 ; 1 2 3 4 + pmullw m1, m4 + pmullw m2, m5 + psubw m1, m3 + paddw m1, m2 + psraw m1, 2 + mova [tmpq], m1 + add tmpq, 32 + sub hd, 4 + jg .v_w4_loop + RET +.v_w8: + movu xm0, [srcq+strideq*0] +.v_w8_loop: + vbroadcasti128 m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vpblendd m1, m0, m2, 0xf0 ; 0 1 + vbroadcasti128 m0, [srcq+strideq*0] + vpblendd m2, m0, 0xf0 ; 1 2 + pmullw m1, m4 + pmullw m2, m5 + psubw m1, m3 + paddw m1, m2 + psraw m1, 2 + mova [tmpq], m1 + add tmpq, 32 + sub hd, 2 + jg .v_w8_loop + RET +.v_w16: + movu m0, [srcq+strideq*0] +.v_w16_loop: + movu m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + pmullw m0, m4 + pmullw m1, m5, m2 + psubw m0, m3 + paddw m1, m0 + movu m0, [srcq+strideq*0] + psraw m1, 2 + pmullw m2, m4 + mova [tmpq+32*0], m1 + pmullw m1, m5, m0 + psubw m2, m3 + paddw m1, m2 + psraw m1, 2 + mova [tmpq+32*1], m1 + add tmpq, 32*2 + sub hd, 2 + jg .v_w16_loop + RET +.v_w32: +.v_w64: +.v_w128: +%if WIN64 + PUSH r7 +%endif + movifnidn r7d, org_w + add r7d, r7d + mov r3, srcq + lea r6d, [hq+r7*8-256] + mov r5, tmpq +.v_w32_loop0: + movu m0, [srcq+strideq*0] +.v_w32_loop: + movu m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + pmullw m0, m4 + pmullw m1, m5, m2 + psubw m0, m3 + paddw m1, m0 + movu m0, [srcq+strideq*0] + psraw m1, 2 + pmullw m2, m4 + mova [tmpq+r7*0], m1 + pmullw m1, m5, m0 + psubw m2, m3 + paddw m1, m2 + psraw m1, 2 + mova [tmpq+r7*1], m1 + lea tmpq, [tmpq+r7*2] + sub hd, 2 + jg .v_w32_loop + add r3, 32 + add r5, 32 + movzx hd, r6b + mov srcq, r3 + mov tmpq, r5 + sub r6d, 1<<8 + jg .v_w32_loop0 +%if WIN64 + POP r7 +%endif + RET +.hv: + WIN64_SPILL_XMM 7 + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] + shl mxyd, 11 + movd xm6, mxyd + add wq, r6 + lea stride3q, [strideq*3] + vpbroadcastw m6, xm6 + jmp wq +.hv_w4: + movu xm1, [srcq+strideq*0] +%if WIN64 + movaps [rsp+24], xmm7 +%endif + pmullw xm0, xm4, xm1 + psrldq xm1, 2 + pmullw xm1, xm5 + psubw xm0, xm3 + paddw xm0, xm1 + psraw xm0, 2 + vpbroadcastq m0, xm0 +.hv_w4_loop: + movu xm1, [srcq+strideq*1] + vinserti128 m1, [srcq+stride3q ], 1 + movu xm2, [srcq+strideq*2] + lea srcq, [srcq+strideq*4] + vinserti128 m2, [srcq+strideq*0], 1 + punpcklqdq m7, m1, m2 + psrldq m1, 2 + pslldq m2, 6 + pmullw m7, m4 + vpblendd m1, m2, 0xcc + pmullw m1, m5 + psubw m7, m3 + paddw m1, m7 + psraw m1, 2 ; 1 2 3 4 + vpblendd m0, m1, 0x3f + vpermq m2, m0, q2103 ; 0 1 2 3 + mova m0, m1 + psubw m1, m2 + pmulhrsw m1, m6 + paddw m1, m2 + mova [tmpq], m1 + add tmpq, 32 + sub hd, 4 + jg .hv_w4_loop +%if WIN64 + movaps xmm7, [rsp+24] +%endif + RET +.hv_w8: + pmullw xm0, xm4, [srcq+strideq*0] + pmullw xm1, xm5, [srcq+strideq*0+2] + psubw xm0, xm3 + paddw xm0, xm1 + psraw xm0, 2 + vinserti128 m0, xm0, 1 +.hv_w8_loop: + movu xm1, [srcq+strideq*1] + movu xm2, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + vinserti128 m1, [srcq+strideq*0], 1 + vinserti128 m2, [srcq+strideq*0+2], 1 + pmullw m1, m4 + pmullw m2, m5 + psubw m1, m3 + paddw m1, m2 + psraw m1, 2 ; 1 2 + vperm2i128 m2, m0, m1, 0x21 ; 0 1 + mova m0, m1 + psubw m1, m2 + pmulhrsw m1, m6 + paddw m1, m2 + mova [tmpq], m1 + add tmpq, 32 + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w16: +.hv_w32: +.hv_w64: +.hv_w128: +%if WIN64 + PUSH r7 +%endif + movifnidn r7d, org_w + add r7d, r7d + mov r3, srcq + lea r6d, [hq+r7*8-256] + mov r5, tmpq +.hv_w16_loop0: + pmullw 
m0, m4, [srcq] + pmullw m1, m5, [srcq+2] + psubw m0, m3 + paddw m0, m1 + psraw m0, 2 +.hv_w16_loop: + pmullw m1, m4, [srcq+strideq*1] + pmullw m2, m5, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + psubw m1, m3 + paddw m1, m2 + psraw m1, 2 + psubw m2, m1, m0 + pmulhrsw m2, m6 + paddw m2, m0 + mova [tmpq+r7*0], m2 + pmullw m0, m4, [srcq+strideq*0] + pmullw m2, m5, [srcq+strideq*0+2] + psubw m0, m3 + paddw m0, m2 + psraw m0, 2 + psubw m2, m0, m1 + pmulhrsw m2, m6 + paddw m2, m1 + mova [tmpq+r7*1], m2 + lea tmpq, [tmpq+r7*2] + sub hd, 2 + jg .hv_w16_loop + add r3, 32 + add r5, 32 + movzx hd, r6b + mov srcq, r3 + mov tmpq, r5 + sub r6d, 1<<8 + jg .hv_w16_loop0 +%if WIN64 + POP r7 +%endif + RET + +; int8_t subpel_filters[5][15][8] +%assign FILTER_REGULAR (0*15 << 16) | 3*15 +%assign FILTER_SMOOTH (1*15 << 16) | 4*15 +%assign FILTER_SHARP (2*15 << 16) | 3*15 + +%macro FN 4 ; prefix, type, type_h, type_v +cglobal %1_%2_16bpc + mov t0d, FILTER_%3 +%ifidn %3, %4 + mov t1d, t0d +%else + mov t1d, FILTER_%4 +%endif +%ifnidn %2, regular ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%1_16bpc %+ SUFFIX) +%endif +%endmacro + +%if WIN64 +DECLARE_REG_TMP 4, 5 +%else +DECLARE_REG_TMP 7, 8 +%endif + +%define PUT_8TAP_FN FN put_8tap, +PUT_8TAP_FN sharp, SHARP, SHARP +PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH +PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP +PUT_8TAP_FN smooth, SMOOTH, SMOOTH +PUT_8TAP_FN sharp_regular, SHARP, REGULAR +PUT_8TAP_FN regular_sharp, REGULAR, SHARP +PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR +PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_FN regular, REGULAR, REGULAR + +cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my +%define base r8-put_avx2 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r8, [put_avx2] + movifnidn wd, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v + tzcnt wd, wd + movzx wd, word [r8+wq*2+table_offset(put,)] + add wq, r8 +%if WIN64 + pop r8 +%endif + jmp wq +.h_w2: + movzx mxd, mxb + sub srcq, 2 + mova xm2, [subpel_h_shuf2] + vpbroadcastd xm3, [base+subpel_filters+mxq*8+2] + pmovsxbw xm3, xm3 +.h_w2_loop: + movu xm0, [srcq+ssq*0] + movu xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xm0, xm2 + pshufb xm1, xm2 + pmaddwd xm0, xm3 + pmaddwd xm1, xm3 + phaddd xm0, xm1 + paddd xm0, xm4 + psrad xm0, 6 + packusdw xm0, xm0 + pminsw xm0, xm5 + movd [dstq+dsq*0], xm0 + pextrd [dstq+dsq*1], xm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2_loop + RET +.h_w4: + movzx mxd, mxb + sub srcq, 2 + pmovsxbw xm3, [base+subpel_filters+mxq*8] + WIN64_SPILL_XMM 8 + vbroadcasti128 m6, [subpel_h_shufA] + vbroadcasti128 m7, [subpel_h_shufB] + pshufd xm3, xm3, q2211 + vpbroadcastq m2, xm3 + vpermq m3, m3, q1111 +.h_w4_loop: + movu xm1, [srcq+ssq*0] + vinserti128 m1, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pshufb m0, m1, m6 ; 0 1 1 2 2 3 3 4 + pshufb m1, m7 ; 2 3 3 4 4 5 5 6 + pmaddwd m0, m2 + pmaddwd m1, m3 + paddd m0, m4 + paddd m0, m1 + psrad m0, 6 + vextracti128 xm1, m0, 1 + packusdw xm0, xm1 + pminsw xm0, xm5 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4_loop + RET +.h: + test myd, 0xf00 + jnz .hv + mov r7d, r8m + vpbroadcastw m5, r8m + shr r7d, 11 + vpbroadcastd m4, [base+put_8tap_h_rnd+r7*4] + cmp wd, 4 + je .h_w4 + jl .h_w2 + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 13 + shr mxd, 16 + sub srcq, 6 + vpbroadcastq m0, [base+subpel_filters+mxq*8] 
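+; A hedged scalar sketch of the horizontal 8-tap path that follows: the
+; packed int8 taps loaded above are widened to int16 in-register (the
+; punpcklbw m0, m0 / psraw m0, 8 pair below duplicates each byte into its
+; high half and arithmetic-shifts it back, i.e. a sign extension), after
+; which each output pixel is computed roughly as
+;   dst[x] = clamp((sum(tap[i] * src[x+i-3]) + rnd) >> 6, 0, bitdepth_max)
+; where rnd comes from put_8tap_h_rnd (34 for 10-bit, 40 for 12-bit input)
+; and the clamp is realized by packusdw plus pminsw against bitdepth_max.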
+ vbroadcasti128 m6, [subpel_h_shufA] + vbroadcasti128 m7, [subpel_h_shufB] + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + pshufd m8, m0, q0000 + pshufd m9, m0, q1111 + pshufd m10, m0, q2222 + pshufd m11, m0, q3333 + cmp wd, 8 + jg .h_w16 +.h_w8: +%macro PUT_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] + pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6 + pshufb m%1, m6 ; 0 1 1 2 2 3 3 4 + pmaddwd m%5, m9, m%4 ; abcd1 + pmaddwd m%1, m8 ; abcd0 + pshufb m%2, m7 ; 6 7 7 8 8 9 9 a + shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8 + paddd m%5, m4 + paddd m%1, m%5 + pmaddwd m%5, m11, m%2 ; abcd3 + paddd m%1, m%5 + pmaddwd m%5, m10, m%4 ; abcd2 + pshufb m%3, m7 ; a b b c c d d e + pmaddwd m%4, m8 ; efgh0 + paddd m%1, m%5 + pmaddwd m%5, m9, m%2 ; efgh1 + shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c + pmaddwd m%3, m11 ; efgh3 + pmaddwd m%2, m10 ; efgh2 + paddd m%4, m4 + paddd m%4, m%5 + paddd m%3, m%4 + paddd m%2, m%3 + psrad m%1, 6 + psrad m%2, 6 + packusdw m%1, m%2 + pminsw m%1, m5 +%endmacro + movu xm0, [srcq+ssq*0+ 0] + vinserti128 m0, [srcq+ssq*1+ 0], 1 + movu xm2, [srcq+ssq*0+16] + vinserti128 m2, [srcq+ssq*1+16], 1 + lea srcq, [srcq+ssq*2] + shufpd m1, m0, m2, 0x05 + PUT_8TAP_H 0, 1, 2, 3, 12 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + mov r6d, wd +.h_w16_loop: + movu m0, [srcq+r6*2-32] + movu m1, [srcq+r6*2-24] + movu m2, [srcq+r6*2-16] + PUT_8TAP_H 0, 1, 2, 3, 12 + mova [dstq+r6*2-32], m0 + sub r6d, 16 + jg .h_w16_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w16 + RET +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmovle myd, mxd + vpbroadcastq m0, [base+subpel_filters+myq*8] + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 15 + vpbroadcastd m6, [pd_32] + vpbroadcastw m7, r8m + lea r6, [ssq*3] + sub srcq, r6 + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + pshufd m8, m0, q0000 + pshufd m9, m0, q1111 + pshufd m10, m0, q2222 + pshufd m11, m0, q3333 + cmp wd, 4 + jg .v_w8 + je .v_w4 +.v_w2: + movd xm2, [srcq+ssq*0] + pinsrd xm2, [srcq+ssq*1], 1 + pinsrd xm2, [srcq+ssq*2], 2 + pinsrd xm2, [srcq+r6 ], 3 ; 0 1 2 3 + lea srcq, [srcq+ssq*4] + movd xm3, [srcq+ssq*0] + vpbroadcastd xm1, [srcq+ssq*1] + vpbroadcastd xm0, [srcq+ssq*2] + add srcq, r6 + vpblendd xm3, xm1, 0x02 ; 4 5 + vpblendd xm1, xm0, 0x02 ; 5 6 + palignr xm4, xm3, xm2, 4 ; 1 2 3 4 + punpcklwd xm3, xm1 ; 45 56 + punpcklwd xm1, xm2, xm4 ; 01 12 + punpckhwd xm2, xm4 ; 23 34 +.v_w2_loop: + vpbroadcastd xm4, [srcq+ssq*0] + pmaddwd xm5, xm8, xm1 ; a0 b0 + mova xm1, xm2 + pmaddwd xm2, xm9 ; a1 b1 + paddd xm5, xm6 + paddd xm5, xm2 + mova xm2, xm3 + pmaddwd xm3, xm10 ; a2 b2 + paddd xm5, xm3 + vpblendd xm3, xm0, xm4, 0x02 ; 6 7 + vpbroadcastd xm0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd xm4, xm0, 0x02 ; 7 8 + punpcklwd xm3, xm4 ; 67 78 + pmaddwd xm4, xm11, xm3 ; a3 b3 + paddd xm5, xm4 + psrad xm5, 6 + packusdw xm5, xm5 + pminsw xm5, xm7 + movd [dstq+dsq*0], xm5 + pextrd [dstq+dsq*1], xm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movq xm1, [srcq+ssq*0] + vpbroadcastq m0, [srcq+ssq*1] + vpbroadcastq m2, [srcq+ssq*2] + vpbroadcastq m4, [srcq+r6 ] + lea srcq, [srcq+ssq*4] + vpbroadcastq m3, [srcq+ssq*0] + vpbroadcastq m5, [srcq+ssq*1] + vpblendd m1, m0, 0x30 + vpblendd m0, m2, 0x30 + punpcklwd m1, m0 ; 01 12 + vpbroadcastq m0, [srcq+ssq*2] + add srcq, r6 + vpblendd m2, m4, 0x30 + vpblendd m4, m3, 0x30 + punpcklwd m2, m4 ; 23 34 + vpblendd m3, m5, 0x30 + vpblendd m5, m0, 0x30 + punpcklwd m3, m5 ; 45 56 +.v_w4_loop: 
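+; The registers set up above hold adjacent row pairs interleaved word-wise
+; (01 12, 23 34, 45 56), so each pmaddwd in this loop accumulates one pair
+; of vertical taps for two output rows (a and b) at once; a rough scalar
+; model, assuming the same clamping as the horizontal path:
+;   dst[x] = clamp((sum(tap[i] * src[x+(i-3)*ss]) + 32) >> 6, 0, bitdepth_max)
+; (pd_32 rounding, psrad by 6, then packusdw/pminsw).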
+ vpbroadcastq m4, [srcq+ssq*0] + pmaddwd m5, m8, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m9 ; a1 b1 + paddd m5, m6 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, m10 ; a2 b2 + paddd m5, m3 + vpblendd m3, m0, m4, 0x30 + vpbroadcastq m0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd m4, m0, 0x30 + punpcklwd m3, m4 ; 67 78 + pmaddwd m4, m11, m3 ; a3 b3 + paddd m5, m4 + psrad m5, 6 + vextracti128 xm4, m5, 1 + packusdw xm5, xm4 + pminsw xm5, xm7 + movq [dstq+dsq*0], xm5 + movhps [dstq+dsq*1], xm5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + shl wd, 5 + mov r7, srcq + mov r8, dstq + lea wd, [hq+wq-256] +.v_w8_loop0: + vbroadcasti128 m4, [srcq+ssq*0] + vbroadcasti128 m5, [srcq+ssq*1] + vbroadcasti128 m0, [srcq+r6 ] + vbroadcasti128 m6, [srcq+ssq*2] + lea srcq, [srcq+ssq*4] + vbroadcasti128 m1, [srcq+ssq*0] + vbroadcasti128 m2, [srcq+ssq*1] + vbroadcasti128 m3, [srcq+ssq*2] + add srcq, r6 + shufpd m4, m0, 0x0c + shufpd m5, m1, 0x0c + punpcklwd m1, m4, m5 ; 01 + punpckhwd m4, m5 ; 34 + shufpd m6, m2, 0x0c + punpcklwd m2, m5, m6 ; 12 + punpckhwd m5, m6 ; 45 + shufpd m0, m3, 0x0c + punpcklwd m3, m6, m0 ; 23 + punpckhwd m6, m0 ; 56 +.v_w8_loop: + vbroadcasti128 m14, [srcq+ssq*0] + pmaddwd m12, m8, m1 ; a0 + pmaddwd m13, m8, m2 ; b0 + mova m1, m3 + mova m2, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddd m12, m3 + paddd m13, m4 + mova m3, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m12, m5 + vbroadcasti128 m5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + paddd m13, m6 + shufpd m6, m0, m14, 0x0d + shufpd m0, m14, m5, 0x0c + punpcklwd m5, m6, m0 ; 67 + punpckhwd m6, m0 ; 78 + pmaddwd m14, m11, m5 ; a3 + paddd m12, m14 + pmaddwd m14, m11, m6 ; b3 + paddd m13, m14 + psrad m12, 5 + psrad m13, 5 + packusdw m12, m13 + pxor m13, m13 + pavgw m12, m13 + pminsw m12, m7 + vpermq m12, m12, q3120 + mova [dstq+dsq*0], xm12 + vextracti128 [dstq+dsq*1], m12, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + add r7, 16 + add r8, 16 + movzx hd, wb + mov srcq, r7 + mov dstq, r8 + sub wd, 1<<8 + jg .v_w8_loop0 + RET +.hv: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 16 + vpbroadcastw m15, r8m + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + vpbroadcastd m0, [base+subpel_filters+mxq*8+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmovle myd, mxd + vpbroadcastq m1, [base+subpel_filters+myq*8] + vpbroadcastd m6, [pd_512] + lea r6, [ssq*3] + sub srcq, 2 + sub srcq, r6 + pxor m7, m7 + punpcklbw m7, m0 + punpcklbw m1, m1 + psraw m1, 8 ; sign-extend + test dword r8m, 0x800 + jz .hv_10bit + psraw m7, 2 + psllw m1, 2 +.hv_10bit: + pshufd m11, m1, q0000 + pshufd m12, m1, q1111 + pshufd m13, m1, q2222 + pshufd m14, m1, q3333 + cmp wd, 4 + je .hv_w4 + vbroadcasti128 m9, [subpel_h_shuf2] + vbroadcasti128 m1, [srcq+r6 ] ; 3 3 + movu xm3, [srcq+ssq*2] + movu xm0, [srcq+ssq*0] + movu xm2, [srcq+ssq*1] + lea srcq, [srcq+ssq*4] + vinserti128 m3, [srcq+ssq*0], 1 ; 2 4 + vinserti128 m0, [srcq+ssq*1], 1 ; 0 5 + vinserti128 m2, [srcq+ssq*2], 1 ; 1 6 + add srcq, r6 + pshufb m1, m9 + pshufb m3, m9 + pshufb m0, m9 + pshufb m2, m9 + pmaddwd m1, m7 + pmaddwd m3, m7 + pmaddwd m0, m7 + pmaddwd m2, m7 + phaddd m1, m3 + phaddd m0, m2 + paddd m1, m6 + paddd m0, m6 + psrad m1, 10 + psrad m0, 10 + packssdw m1, m0 ; 3 2 0 1 + vextracti128 xm0, m1, 1 ; 3 4 5 6 + pshufd xm2, xm1, q1301 ; 2 3 1 2 + pshufd xm3, xm0, q2121 ; 4 5 4 5 + punpckhwd xm1, xm2 ; 01 12 + punpcklwd xm2, xm0 ; 23 34 + punpckhwd xm3, xm0 ; 45 56 +.hv_w2_loop: + movu xm4, [srcq+ssq*0] + movu xm5, 
[srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xm4, xm9 + pshufb xm5, xm9 + pmaddwd xm4, xm7 + pmaddwd xm5, xm7 + phaddd xm4, xm5 + pmaddwd xm5, xm11, xm1 ; a0 b0 + mova xm1, xm2 + pmaddwd xm2, xm12 ; a1 b1 + paddd xm5, xm2 + mova xm2, xm3 + pmaddwd xm3, xm13 ; a2 b2 + paddd xm5, xm3 + paddd xm4, xm6 + psrad xm4, 10 + packssdw xm4, xm4 + palignr xm3, xm4, xm0, 12 + mova xm0, xm4 + punpcklwd xm3, xm0 ; 67 78 + pmaddwd xm4, xm14, xm3 ; a3 b3 + paddd xm5, xm6 + paddd xm5, xm4 + psrad xm5, 10 + packusdw xm5, xm5 + pminsw xm5, xm15 + movd [dstq+dsq*0], xm5 + pextrd [dstq+dsq*1], xm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + vbroadcasti128 m9, [subpel_h_shufA] + vbroadcasti128 m10, [subpel_h_shufB] + pshufd m8, m7, q1111 + pshufd m7, m7, q0000 + movu xm1, [srcq+ssq*0] + vinserti128 m1, [srcq+ssq*1], 1 ; 0 1 + vbroadcasti128 m0, [srcq+r6 ] + vinserti128 m2, m0, [srcq+ssq*2], 0 ; 2 3 + lea srcq, [srcq+ssq*4] + vinserti128 m0, [srcq+ssq*0], 1 ; 3 4 + movu xm3, [srcq+ssq*1] + vinserti128 m3, [srcq+ssq*2], 1 ; 5 6 + add srcq, r6 + pshufb m4, m1, m9 + pshufb m1, m10 + pmaddwd m4, m7 + pmaddwd m1, m8 + pshufb m5, m2, m9 + pshufb m2, m10 + pmaddwd m5, m7 + pmaddwd m2, m8 + paddd m4, m6 + paddd m1, m4 + pshufb m4, m0, m9 + pshufb m0, m10 + pmaddwd m4, m7 + pmaddwd m0, m8 + paddd m5, m6 + paddd m2, m5 + pshufb m5, m3, m9 + pshufb m3, m10 + pmaddwd m5, m7 + pmaddwd m3, m8 + paddd m4, m6 + paddd m4, m0 + paddd m5, m6 + paddd m5, m3 + vperm2i128 m0, m1, m2, 0x21 + psrld m1, 10 + psrld m2, 10 + vperm2i128 m3, m4, m5, 0x21 + pslld m4, 6 + pslld m5, 6 + pblendw m2, m4, 0xaa ; 23 34 + pslld m0, 6 + pblendw m1, m0, 0xaa ; 01 12 + psrld m3, 10 + pblendw m3, m5, 0xaa ; 45 56 + psrad m0, m5, 16 +.hv_w4_loop: + movu xm4, [srcq+ssq*0] + vinserti128 m4, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pmaddwd m5, m11, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m12 ; a1 b1 + paddd m5, m6 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, m13 ; a2 b2 + paddd m5, m3 + pshufb m3, m4, m9 + pshufb m4, m10 + pmaddwd m3, m7 + pmaddwd m4, m8 + paddd m3, m6 + paddd m4, m3 + psrad m4, 10 + packssdw m0, m4 ; _ 7 6 8 + vpermq m3, m0, q1122 ; _ 6 _ 7 + punpckhwd m3, m0 ; 67 78 + mova m0, m4 + pmaddwd m4, m14, m3 ; a3 b3 + paddd m4, m5 + psrad m4, 10 + vextracti128 xm5, m4, 1 + packusdw xm4, xm5 + pminsw xm4, xm15 + movq [dstq+dsq*0], xm4 + movhps [dstq+dsq*1], xm4 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + shr mxd, 16 + vpbroadcastq m2, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmovle myd, mxd + pmovsxbw xm1, [base+subpel_filters+myq*8] + shl wd, 5 + lea r6, [ssq*3] + sub srcq, 6 + sub srcq, r6 + pxor m0, m0 + punpcklbw m0, m2 + mov r7, srcq + mov r8, dstq + lea wd, [hq+wq-256] + test dword r8m, 0x800 + jz .hv_w8_10bit + psraw m0, 2 + psllw xm1, 2 +.hv_w8_10bit: + pshufd m11, m0, q0000 + pshufd m12, m0, q1111 + pshufd m13, m0, q2222 + pshufd m14, m0, q3333 +%if WIN64 + %define v_mul (rsp+stack_offset+40) ; r4m +%else + %define v_mul (rsp-24) ; red zone +%endif + mova [v_mul], xm1 +.hv_w8_loop0: +%macro PUT_8TAP_HV_H 3 ; dst/src+0, src+8, src+16 + pshufb m2, m%1, m9 ; 2 3 3 4 4 5 5 6 + pshufb m%1, m8 ; 0 1 1 2 2 3 3 4 + pmaddwd m3, m12, m2 + pmaddwd m%1, m11 + pshufb m%2, m9 ; 6 7 7 8 8 9 9 a + shufpd m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8 + paddd m3, m10 + paddd m%1, m3 + pmaddwd m3, m14, m%2 + paddd m%1, m3 + pmaddwd m3, m13, m2 + pshufb m%3, m9 ; a b b c c d d e + pmaddwd m2, m11 + paddd m%1, m3 + pmaddwd m3, m12, m%2 + shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c + 
pmaddwd m%3, m14 + pmaddwd m%2, m13 + paddd m2, m10 + paddd m2, m3 + paddd m%3, m2 + paddd m%2, m%3 + psrad m%1, 10 + psrad m%2, 10 + packssdw m%1, m%2 +%endmacro + movu xm4, [srcq+r6 *1+ 0] + vbroadcasti128 m8, [subpel_h_shufA] + movu xm6, [srcq+r6 *1+ 8] + vbroadcasti128 m9, [subpel_h_shufB] + movu xm0, [srcq+r6 *1+16] + vpbroadcastd m10, [pd_512] + movu xm5, [srcq+ssq*0+ 0] + vinserti128 m5, [srcq+ssq*4+ 0], 1 + movu xm1, [srcq+ssq*0+16] + vinserti128 m1, [srcq+ssq*4+16], 1 + shufpd m7, m5, m1, 0x05 + INIT_XMM avx2 + PUT_8TAP_HV_H 4, 6, 0 ; 3 + INIT_YMM avx2 + PUT_8TAP_HV_H 5, 7, 1 ; 0 4 + movu xm0, [srcq+ssq*2+ 0] + vinserti128 m0, [srcq+r6 *2+ 0], 1 + movu xm1, [srcq+ssq*2+16] + vinserti128 m1, [srcq+r6 *2+16], 1 + shufpd m7, m0, m1, 0x05 + PUT_8TAP_HV_H 0, 7, 1 ; 2 6 + movu xm6, [srcq+ssq*1+ 0] + movu xm1, [srcq+ssq*1+16] + lea srcq, [srcq+ssq*4] + vinserti128 m6, [srcq+ssq*1+ 0], 1 + vinserti128 m1, [srcq+ssq*1+16], 1 + add srcq, r6 + shufpd m7, m6, m1, 0x05 + PUT_8TAP_HV_H 6, 7, 1 ; 1 5 + vpermq m4, m4, q1100 + vpermq m5, m5, q3120 + vpermq m6, m6, q3120 + vpermq m7, m0, q3120 + punpcklwd m3, m7, m4 ; 23 + punpckhwd m4, m5 ; 34 + punpcklwd m1, m5, m6 ; 01 + punpckhwd m5, m6 ; 45 + punpcklwd m2, m6, m7 ; 12 + punpckhwd m6, m7 ; 56 +.hv_w8_loop: + vpbroadcastd m9, [v_mul+4*0] + vpbroadcastd m7, [v_mul+4*1] + vpbroadcastd m10, [v_mul+4*2] + pmaddwd m8, m9, m1 ; a0 + pmaddwd m9, m2 ; b0 + mova m1, m3 + mova m2, m4 + pmaddwd m3, m7 ; a1 + pmaddwd m4, m7 ; b1 + paddd m8, m3 + paddd m9, m4 + mova m3, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m8, m5 + paddd m9, m6 + movu xm5, [srcq+ssq*0] + vinserti128 m5, [srcq+ssq*1], 1 + vbroadcasti128 m7, [subpel_h_shufA] + vbroadcasti128 m10, [subpel_h_shufB] + movu xm6, [srcq+ssq*0+16] + vinserti128 m6, [srcq+ssq*1+16], 1 + vextracti128 [dstq], m0, 1 + pshufb m0, m5, m7 ; 01 + pshufb m5, m10 ; 23 + pmaddwd m0, m11 + pmaddwd m5, m12 + paddd m0, m5 + pshufb m5, m6, m7 ; 89 + pshufb m6, m10 ; ab + pmaddwd m5, m13 + pmaddwd m6, m14 + paddd m6, m5 + movu xm5, [srcq+ssq*0+8] + vinserti128 m5, [srcq+ssq*1+8], 1 + lea srcq, [srcq+ssq*2] + pshufb m7, m5, m7 + pshufb m5, m10 + pmaddwd m10, m13, m7 + pmaddwd m7, m11 + paddd m0, m10 + vpbroadcastd m10, [pd_512] + paddd m6, m7 + pmaddwd m7, m14, m5 + pmaddwd m5, m12 + paddd m0, m7 + paddd m5, m6 + vbroadcasti128 m6, [dstq] + paddd m8, m10 + paddd m9, m10 + paddd m0, m10 + paddd m5, m10 + vpbroadcastd m10, [v_mul+4*3] + psrad m0, 10 + psrad m5, 10 + packssdw m0, m5 + vpermq m7, m0, q3120 ; 7 8 + shufpd m6, m7, 0x04 ; 6 7 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m10, m5 ; a3 + pmaddwd m10, m6 ; b3 + paddd m7, m8 + paddd m9, m10 + psrad m7, 10 + psrad m9, 10 + packusdw m7, m9 + pminsw m7, m15 + vpermq m7, m7, q3120 + mova [dstq+dsq*0], xm7 + vextracti128 [dstq+dsq*1], m7, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + add r7, 16 + add r8, 16 + movzx hd, wb + mov srcq, r7 + mov dstq, r8 + sub wd, 1<<8 + jg .hv_w8_loop0 + RET + +%if WIN64 +DECLARE_REG_TMP 6, 4 +%else +DECLARE_REG_TMP 6, 7 +%endif + +%define PREP_8TAP_FN FN prep_8tap, +PREP_8TAP_FN sharp, SHARP, SHARP +PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH +PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP +PREP_8TAP_FN smooth, SMOOTH, SMOOTH +PREP_8TAP_FN sharp_regular, SHARP, REGULAR +PREP_8TAP_FN regular_sharp, REGULAR, SHARP +PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR +PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH +PREP_8TAP_FN regular, REGULAR, REGULAR + +cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, 
stride, w, h, mx, my +%define base r7-prep_avx2 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r7, [prep_avx2] + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v + tzcnt wd, wd + mov r6d, r7m ; bitdepth_max + movzx wd, word [r7+wq*2+table_offset(prep,)] + vpbroadcastd m5, [r7-prep_avx2+pw_8192] + shr r6d, 11 + add wq, r7 + vpbroadcastd m4, [base+prep_mul+r6*4] + lea r6, [strideq*3] +%if WIN64 + pop r7 +%endif + jmp wq +.h_w4: + movzx mxd, mxb + sub srcq, 2 + pmovsxbw xm0, [base+subpel_filters+mxq*8] + vbroadcasti128 m3, [subpel_h_shufA] + vbroadcasti128 m4, [subpel_h_shufB] + WIN64_SPILL_XMM 8 + pshufd xm0, xm0, q2211 + test dword r7m, 0x800 + jnz .h_w4_12bpc + psllw xm0, 2 +.h_w4_12bpc: + vpbroadcastq m6, xm0 + vpermq m7, m0, q1111 +.h_w4_loop: + movu xm1, [srcq+strideq*0] + vinserti128 m1, [srcq+strideq*2], 1 + movu xm2, [srcq+strideq*1] + vinserti128 m2, [srcq+r6 ], 1 + lea srcq, [srcq+strideq*4] + pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4 + pshufb m1, m4 ; 2 3 3 4 4 5 5 6 + pmaddwd m0, m6 + pmaddwd m1, m7 + paddd m0, m5 + paddd m0, m1 + pshufb m1, m2, m3 + pshufb m2, m4 + pmaddwd m1, m6 + pmaddwd m2, m7 + paddd m1, m5 + paddd m1, m2 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 4 + jg .h_w4_loop + RET +.h: + test myd, 0xf00 + jnz .hv + vpbroadcastd m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4) + lea r6, [strideq*3] + cmp wd, 4 + je .h_w4 + shr mxd, 16 + sub srcq, 6 + vpbroadcastq m0, [base+subpel_filters+mxq*8] + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 12 + vbroadcasti128 m6, [subpel_h_shufA] + vbroadcasti128 m7, [subpel_h_shufB] + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + test dword r7m, 0x800 + jnz .h_12bpc + psllw m0, 2 +.h_12bpc: + pshufd m8, m0, q0000 + pshufd m9, m0, q1111 + pshufd m10, m0, q2222 + pshufd m11, m0, q3333 + cmp wd, 8 + jg .h_w16 +.h_w8: +%macro PREP_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] + pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6 + pshufb m%1, m6 ; 0 1 1 2 2 3 3 4 + pmaddwd m%5, m9, m%4 ; abcd1 + pmaddwd m%1, m8 ; abcd0 + pshufb m%2, m7 ; 6 7 7 8 8 9 9 a + shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8 + paddd m%5, m5 + paddd m%1, m%5 + pmaddwd m%5, m11, m%2 ; abcd3 + paddd m%1, m%5 + pmaddwd m%5, m10, m%4 ; abcd2 + pshufb m%3, m7 ; a b b c c d d e + pmaddwd m%4, m8 ; efgh0 + paddd m%1, m%5 + pmaddwd m%5, m9, m%2 ; efgh1 + shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c + pmaddwd m%3, m11 ; efgh3 + pmaddwd m%2, m10 ; efgh2 + paddd m%4, m5 + paddd m%4, m%5 + paddd m%3, m%4 + paddd m%2, m%3 + psrad m%1, 4 + psrad m%2, 4 + packssdw m%1, m%2 +%endmacro + movu xm0, [srcq+strideq*0+ 0] + vinserti128 m0, [srcq+strideq*1+ 0], 1 + movu xm2, [srcq+strideq*0+16] + vinserti128 m2, [srcq+strideq*1+16], 1 + lea srcq, [srcq+strideq*2] + shufpd m1, m0, m2, 0x05 + PREP_8TAP_H 0, 1, 2, 3, 4 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 2 + jg .h_w8 + RET +.h_w16: + add wd, wd +.h_w16_loop0: + mov r6d, wd +.h_w16_loop: + movu m0, [srcq+r6-32] + movu m1, [srcq+r6-24] + movu m2, [srcq+r6-16] + PREP_8TAP_H 0, 1, 2, 3, 4 + mova [tmpq+r6-32], m0 + sub r6d, 32 + jg .h_w16_loop + add srcq, strideq + add tmpq, wq + dec hd + jg .h_w16_loop0 + RET +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmovle myd, mxd + vpbroadcastq m0, [base+subpel_filters+myq*8] + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 15 + vpbroadcastd m7, [prep_8tap_1d_rnd] + lea r6, [strideq*3] + sub srcq, r6 + punpcklbw m0, m0 + psraw 
m0, 8 ; sign-extend + test dword r7m, 0x800 + jnz .v_12bpc + psllw m0, 2 +.v_12bpc: + pshufd m8, m0, q0000 + pshufd m9, m0, q1111 + pshufd m10, m0, q2222 + pshufd m11, m0, q3333 + cmp wd, 4 + jg .v_w8 +.v_w4: + movq xm1, [srcq+strideq*0] + vpbroadcastq m0, [srcq+strideq*1] + vpbroadcastq m2, [srcq+strideq*2] + vpbroadcastq m4, [srcq+r6 ] + lea srcq, [srcq+strideq*4] + vpbroadcastq m3, [srcq+strideq*0] + vpbroadcastq m5, [srcq+strideq*1] + vpblendd m1, m0, 0x30 + vpblendd m0, m2, 0x30 + punpcklwd m1, m0 ; 01 12 + vpbroadcastq m0, [srcq+strideq*2] + add srcq, r6 + vpblendd m2, m4, 0x30 + vpblendd m4, m3, 0x30 + punpcklwd m2, m4 ; 23 34 + vpblendd m3, m5, 0x30 + vpblendd m5, m0, 0x30 + punpcklwd m3, m5 ; 45 56 +.v_w4_loop: + vpbroadcastq m4, [srcq+strideq*0] + pmaddwd m5, m8, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m9 ; a1 b1 + paddd m5, m7 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, m10 ; a2 b2 + paddd m5, m3 + vpblendd m3, m0, m4, 0x30 + vpbroadcastq m0, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vpblendd m4, m0, 0x30 + punpcklwd m3, m4 ; 67 78 + pmaddwd m4, m11, m3 ; a3 b3 + paddd m5, m4 + psrad m5, 4 + vextracti128 xm4, m5, 1 + packssdw xm5, xm4 + mova [tmpq], xm5 + add tmpq, 16 + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: +%if WIN64 + push r8 +%endif + mov r8d, wd + shl wd, 5 + mov r5, srcq + mov r7, tmpq + lea wd, [hq+wq-256] +.v_w8_loop0: + vbroadcasti128 m4, [srcq+strideq*0] + vbroadcasti128 m5, [srcq+strideq*1] + vbroadcasti128 m0, [srcq+r6 ] + vbroadcasti128 m6, [srcq+strideq*2] + lea srcq, [srcq+strideq*4] + vbroadcasti128 m1, [srcq+strideq*0] + vbroadcasti128 m2, [srcq+strideq*1] + vbroadcasti128 m3, [srcq+strideq*2] + add srcq, r6 + shufpd m4, m0, 0x0c + shufpd m5, m1, 0x0c + punpcklwd m1, m4, m5 ; 01 + punpckhwd m4, m5 ; 34 + shufpd m6, m2, 0x0c + punpcklwd m2, m5, m6 ; 12 + punpckhwd m5, m6 ; 45 + shufpd m0, m3, 0x0c + punpcklwd m3, m6, m0 ; 23 + punpckhwd m6, m0 ; 56 +.v_w8_loop: + vbroadcasti128 m14, [srcq+strideq*0] + pmaddwd m12, m8, m1 ; a0 + pmaddwd m13, m8, m2 ; b0 + mova m1, m3 + mova m2, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddd m12, m7 + paddd m13, m7 + paddd m12, m3 + paddd m13, m4 + mova m3, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m12, m5 + vbroadcasti128 m5, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + paddd m13, m6 + shufpd m6, m0, m14, 0x0d + shufpd m0, m14, m5, 0x0c + punpcklwd m5, m6, m0 ; 67 + punpckhwd m6, m0 ; 78 + pmaddwd m14, m11, m5 ; a3 + paddd m12, m14 + pmaddwd m14, m11, m6 ; b3 + paddd m13, m14 + psrad m12, 4 + psrad m13, 4 + packssdw m12, m13 + vpermq m12, m12, q3120 + mova [tmpq+r8*0], xm12 + vextracti128 [tmpq+r8*2], m12, 1 + lea tmpq, [tmpq+r8*4] + sub hd, 2 + jg .v_w8_loop + add r5, 16 + add r7, 16 + movzx hd, wb + mov srcq, r5 + mov tmpq, r7 + sub wd, 1<<8 + jg .v_w8_loop0 +%if WIN64 + pop r8 +%endif + RET +.hv: + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 16 + vpbroadcastd m15, [prep_8tap_2d_rnd] + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + vpbroadcastd m0, [base+subpel_filters+mxq*8+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmovle myd, mxd + vpbroadcastq m1, [base+subpel_filters+myq*8] + lea r6, [strideq*3] + sub srcq, 2 + sub srcq, r6 + pxor m7, m7 + punpcklbw m7, m0 + punpcklbw m1, m1 + psraw m7, 4 + psraw m1, 8 + test dword r7m, 0x800 + jz .hv_w4_10bit + psraw m7, 2 +.hv_w4_10bit: + pshufd m11, m1, q0000 + pshufd m12, m1, q1111 + pshufd m13, m1, q2222 + pshufd m14, m1, q3333 +.hv_w4: + vbroadcasti128 m9, [subpel_h_shufA] + vbroadcasti128 m10, [subpel_h_shufB] + 
pshufd m8, m7, q1111 + pshufd m7, m7, q0000 + movu xm1, [srcq+strideq*0] + vinserti128 m1, [srcq+strideq*1], 1 ; 0 1 + vbroadcasti128 m0, [srcq+r6 ] + vinserti128 m2, m0, [srcq+strideq*2], 0 ; 2 3 + lea srcq, [srcq+strideq*4] + vinserti128 m0, [srcq+strideq*0], 1 ; 3 4 + movu xm3, [srcq+strideq*1] + vinserti128 m3, [srcq+strideq*2], 1 ; 5 6 + add srcq, r6 + pshufb m4, m1, m9 + pshufb m1, m10 + pmaddwd m4, m7 + pmaddwd m1, m8 + pshufb m5, m2, m9 + pshufb m2, m10 + pmaddwd m5, m7 + pmaddwd m2, m8 + paddd m4, m15 + paddd m1, m4 + pshufb m4, m0, m9 + pshufb m0, m10 + pmaddwd m4, m7 + pmaddwd m0, m8 + paddd m5, m15 + paddd m2, m5 + pshufb m5, m3, m9 + pshufb m3, m10 + pmaddwd m5, m7 + pmaddwd m3, m8 + paddd m4, m15 + paddd m4, m0 + paddd m5, m15 + paddd m5, m3 + vperm2i128 m0, m1, m2, 0x21 + psrld m1, 6 + psrld m2, 6 + vperm2i128 m3, m4, m5, 0x21 + pslld m4, 10 + pslld m5, 10 + pblendw m2, m4, 0xaa ; 23 34 + pslld m0, 10 + pblendw m1, m0, 0xaa ; 01 12 + psrld m3, 6 + pblendw m3, m5, 0xaa ; 45 56 + psrad m0, m5, 16 +.hv_w4_loop: + movu xm4, [srcq+strideq*0] + vinserti128 m4, [srcq+strideq*1], 1 + lea srcq, [srcq+strideq*2] + pmaddwd m5, m11, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m12 ; a1 b1 + paddd m5, m15 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, m13 ; a2 b2 + paddd m5, m3 + pshufb m3, m4, m9 + pshufb m4, m10 + pmaddwd m3, m7 + pmaddwd m4, m8 + paddd m3, m15 + paddd m4, m3 + psrad m4, 6 + packssdw m0, m4 ; _ 7 6 8 + vpermq m3, m0, q1122 ; _ 6 _ 7 + punpckhwd m3, m0 ; 67 78 + mova m0, m4 + pmaddwd m4, m14, m3 ; a3 b3 + paddd m4, m5 + psrad m4, 6 + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + mova [tmpq], xm4 + add tmpq, 16 + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + shr mxd, 16 + vpbroadcastq m2, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmovle myd, mxd + pmovsxbw xm1, [base+subpel_filters+myq*8] +%if WIN64 + PUSH r8 +%endif + mov r8d, wd + shl wd, 5 + lea r6, [strideq*3] + sub srcq, 6 + sub srcq, r6 + mov r5, srcq + mov r7, tmpq + lea wd, [hq+wq-256] + pxor m0, m0 + punpcklbw m0, m2 + mova [v_mul], xm1 + psraw m0, 4 + test dword r7m, 0x800 + jz .hv_w8_10bit + psraw m0, 2 +.hv_w8_10bit: + pshufd m11, m0, q0000 + pshufd m12, m0, q1111 + pshufd m13, m0, q2222 + pshufd m14, m0, q3333 +.hv_w8_loop0: +%macro PREP_8TAP_HV_H 3 ; dst/src+0, src+8, src+16 + pshufb m2, m%1, m9 ; 2 3 3 4 4 5 5 6 + pshufb m%1, m8 ; 0 1 1 2 2 3 3 4 + pmaddwd m3, m12, m2 + pmaddwd m%1, m11 + pshufb m%2, m9 ; 6 7 7 8 8 9 9 a + shufpd m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8 + paddd m3, m15 + paddd m%1, m3 + pmaddwd m3, m14, m%2 + paddd m%1, m3 + pmaddwd m3, m13, m2 + pshufb m%3, m9 ; a b b c c d d e + pmaddwd m2, m11 + paddd m%1, m3 + pmaddwd m3, m12, m%2 + shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c + pmaddwd m%3, m14 + pmaddwd m%2, m13 + paddd m2, m15 + paddd m2, m3 + paddd m2, m%3 + paddd m2, m%2 + psrad m%1, 6 + psrad m2, 6 + packssdw m%1, m2 +%endmacro + movu xm4, [srcq+r6 + 0] + vbroadcasti128 m8, [subpel_h_shufA] + movu xm6, [srcq+r6 + 8] + vbroadcasti128 m9, [subpel_h_shufB] + movu xm0, [srcq+r6 +16] + movu xm5, [srcq+strideq*0+ 0] + vinserti128 m5, [srcq+strideq*4+ 0], 1 + movu xm1, [srcq+strideq*0+16] + vinserti128 m1, [srcq+strideq*4+16], 1 + shufpd m7, m5, m1, 0x05 + INIT_XMM avx2 + PREP_8TAP_HV_H 4, 6, 0 ; 3 + INIT_YMM avx2 + PREP_8TAP_HV_H 5, 7, 1 ; 0 4 + movu xm0, [srcq+strideq*2+ 0] + vinserti128 m0, [srcq+r6 *2+ 0], 1 + movu xm1, [srcq+strideq*2+16] + vinserti128 m1, [srcq+r6 *2+16], 1 + shufpd m7, m0, m1, 0x05 + PREP_8TAP_HV_H 0, 7, 1 ; 2 6 + movu xm6, [srcq+strideq*1+ 0] + movu xm1, 
[srcq+strideq*1+16] + lea srcq, [srcq+strideq*4] + vinserti128 m6, [srcq+strideq*1+ 0], 1 + vinserti128 m1, [srcq+strideq*1+16], 1 + add srcq, r6 + shufpd m7, m6, m1, 0x05 + PREP_8TAP_HV_H 6, 7, 1 ; 1 5 + vpermq m4, m4, q1100 + vpermq m5, m5, q3120 + vpermq m6, m6, q3120 + vpermq m7, m0, q3120 + punpcklwd m3, m7, m4 ; 23 + punpckhwd m4, m5 ; 34 + punpcklwd m1, m5, m6 ; 01 + punpckhwd m5, m6 ; 45 + punpcklwd m2, m6, m7 ; 12 + punpckhwd m6, m7 ; 56 +.hv_w8_loop: + vpbroadcastd m9, [v_mul+4*0] + vpbroadcastd m7, [v_mul+4*1] + vpbroadcastd m10, [v_mul+4*2] + pmaddwd m8, m9, m1 ; a0 + pmaddwd m9, m2 ; b0 + mova m1, m3 + mova m2, m4 + pmaddwd m3, m7 ; a1 + pmaddwd m4, m7 ; b1 + paddd m8, m15 + paddd m9, m15 + paddd m8, m3 + paddd m9, m4 + mova m3, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m8, m5 + paddd m9, m6 + movu xm5, [srcq+strideq*0] + vinserti128 m5, [srcq+strideq*1], 1 + vbroadcasti128 m7, [subpel_h_shufA] + vbroadcasti128 m10, [subpel_h_shufB] + movu xm6, [srcq+strideq*0+16] + vinserti128 m6, [srcq+strideq*1+16], 1 + vextracti128 [tmpq], m0, 1 + pshufb m0, m5, m7 ; 01 + pshufb m5, m10 ; 23 + pmaddwd m0, m11 + pmaddwd m5, m12 + paddd m0, m15 + paddd m0, m5 + pshufb m5, m6, m7 ; 89 + pshufb m6, m10 ; ab + pmaddwd m5, m13 + pmaddwd m6, m14 + paddd m5, m15 + paddd m6, m5 + movu xm5, [srcq+strideq*0+8] + vinserti128 m5, [srcq+strideq*1+8], 1 + lea srcq, [srcq+strideq*2] + pshufb m7, m5, m7 + pshufb m5, m10 + pmaddwd m10, m13, m7 + pmaddwd m7, m11 + paddd m0, m10 + paddd m6, m7 + pmaddwd m7, m14, m5 + pmaddwd m5, m12 + paddd m0, m7 + paddd m5, m6 + vbroadcasti128 m6, [tmpq] + vpbroadcastd m10, [v_mul+4*3] + psrad m0, 6 + psrad m5, 6 + packssdw m0, m5 + vpermq m7, m0, q3120 ; 7 8 + shufpd m6, m7, 0x04 ; 6 7 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m10, m5 ; a3 + pmaddwd m10, m6 ; b3 + paddd m7, m8 + paddd m9, m10 + psrad m7, 6 + psrad m9, 6 + packssdw m7, m9 + vpermq m7, m7, q3120 + mova [tmpq+r8*0], xm7 + vextracti128 [tmpq+r8*2], m7, 1 + lea tmpq, [tmpq+r8*4] + sub hd, 2 + jg .hv_w8_loop + add r5, 16 + add r7, 16 + movzx hd, wb + mov srcq, r5 + mov tmpq, r7 + sub wd, 1<<8 + jg .hv_w8_loop0 +%if WIN64 + POP r8 +%endif + RET + +%macro movifprep 2 + %if isprep + mov %1, %2 + %endif +%endmacro + +%macro REMAP_REG 2 + %xdefine r%1 r%2 + %xdefine r%1q r%2q + %xdefine r%1d r%2d +%endmacro + +%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 + %if isprep + %xdefine r14_save r14 + %assign %%i 14 + %rep 14 + %assign %%j %%i-1 + REMAP_REG %%i, %%j + %assign %%i %%i-1 + %endrep + %endif +%endmacro + +%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 + %if isprep + %assign %%i 1 + %rep 13 + %assign %%j %%i+1 + REMAP_REG %%i, %%j + %assign %%i %%i+1 + %endrep + %xdefine r14 r14_save + %undef r14_save + %endif +%endmacro + +%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged + MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT + RET + %if %1 + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %endif +%endmacro + +%macro MC_8TAP_SCALED_H 8-9 0 ; dst, tmp[0-6], load_hrnd + movu xm%1, [srcq+ r4*2] + movu xm%2, [srcq+ r6*2] + movu xm%3, [srcq+ r7*2] + movu xm%4, [srcq+ r9*2] + vinserti128 m%1, [srcq+r10*2], 1 + vinserti128 m%2, [srcq+r11*2], 1 + vinserti128 m%3, [srcq+r13*2], 1 + vinserti128 m%4, [srcq+ rX*2], 1 + add srcq, ssq + movu xm%5, [srcq+ r4*2] + movu xm%6, [srcq+ r6*2] + movu xm%7, [srcq+ r7*2] + movu xm%8, [srcq+ r9*2] + vinserti128 m%5, [srcq+r10*2], 1 + vinserti128 m%6, [srcq+r11*2], 1 + vinserti128 m%7, [srcq+r13*2], 1 + vinserti128 m%8, [srcq+ rX*2], 1 + add srcq, ssq + 
pmaddwd m%1, m12 + pmaddwd m%2, m13 + pmaddwd m%3, m14 + pmaddwd m%4, m15 + pmaddwd m%5, m12 + pmaddwd m%6, m13 + pmaddwd m%7, m14 + pmaddwd m%8, m15 + phaddd m%1, m%2 + %if %9 + mova m10, [rsp+0x00] + %endif + phaddd m%3, m%4 + phaddd m%5, m%6 + phaddd m%7, m%8 + phaddd m%1, m%3 + phaddd m%5, m%7 + paddd m%1, m10 + paddd m%5, m10 + psrad m%1, xm11 + psrad m%5, xm11 + packssdw m%1, m%5 +%endmacro + +%macro MC_8TAP_SCALED 1 +%ifidn %1, put + %assign isput 1 + %assign isprep 0 +cglobal put_8tap_scaled_16bpc, 4, 14, 16, 0xe0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax + %xdefine base_reg r12 + mov r7d, pxmaxm +%else + %assign isput 0 + %assign isprep 1 +cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax + %define tmp_stridem qword [rsp+0xd0] + %xdefine base_reg r11 +%endif + lea base_reg, [%1_8tap_scaled_16bpc_avx2] +%define base base_reg-%1_8tap_scaled_16bpc_avx2 + tzcnt wd, wm + vpbroadcastd m8, dxm +%if isprep && UNIX64 + movd xm10, mxd + vpbroadcastd m10, xm10 + mov r5d, t0d + DECLARE_REG_TMP 5, 7 + mov r6d, pxmaxm +%else + vpbroadcastd m10, mxm + %if isput + vpbroadcastw m11, pxmaxm + %else + mov r6d, pxmaxm + %endif +%endif + mov dyd, dym +%if isput + %if WIN64 + mov r8d, hm + DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 + %define hm r5m + %define dxm r8m + %else + DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 + %define hm r6m + %endif + %define dsm [rsp+0x98] + %define rX r1 + %define rXd r1d +%else ; prep + %if WIN64 + mov r7d, hm + DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 + %define hm r4m + %define dxm r7m + %else + DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 + %define hm [rsp+0x98] + %endif + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %define rX r14 + %define rXd r14d +%endif + shr r7d, 11 + vpbroadcastd m6, [base+pd_0x3ff] + vpbroadcastd m12, [base+s_8tap_h_rnd+r7*4] + movd xm7, [base+s_8tap_h_sh+r7*4] +%if isput + vpbroadcastd m13, [base+put_s_8tap_v_rnd+r7*4] + pinsrd xm7, [base+put_s_8tap_v_sh+r7*4], 2 +%else + vpbroadcastd m13, [base+pd_m524256] +%endif + pxor m9, m9 + lea ss3q, [ssq*3] + movzx r7d, t1b + shr t1d, 16 + cmp hd, 6 + cmovs t1d, r7d + sub srcq, ss3q + cmp dyd, 1024 + je .dy1 + cmp dyd, 2048 + je .dy2 + movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2] + add wq, base_reg + jmp wq +%if isput +.w2: + mov myd, mym + movzx t0d, t0b + sub srcq, 2 + movd xm15, t0d + punpckldq m8, m9, m8 + paddd m10, m8 ; mx+dx*[0,1] + vpbroadcastd xm14, [base+pq_0x40000000+2] + vpbroadcastd xm15, xm15 + pand xm8, xm10, xm6 + psrld xm8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + vbroadcasti128 m5, [base+bdct_lb_q] + vbroadcasti128 m6, [base+subpel_s_shuf2] + vpbroadcastd xm15, [base+subpel_filters+r4*8+2] + vpbroadcastd xm4, [base+subpel_filters+r6*8+2] + pcmpeqd xm8, xm9 + psrld m10, 10 + paddd m10, m10 + movu xm0, [srcq+ssq*0] + movu xm1, [srcq+ssq*1] + movu xm2, [srcq+ssq*2] + movu xm3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pshufb m10, m5 + paddb m10, m6 + vpblendd xm15, xm4, 0xa + pblendvb xm15, xm14, xm8 + pmovsxbw m15, xm15 + vinserti128 m0, [srcq+ssq*0], 1 ; 0 4 + vinserti128 m1, [srcq+ssq*1], 1 ; 1 5 + vinserti128 m2, [srcq+ssq*2], 1 ; 2 6 + vinserti128 m3, [srcq+ss3q ], 1 ; 3 7 + lea srcq, [srcq+ssq*4] + REPX {pshufb x, m10}, m0, m1, m2, m3 + REPX {pmaddwd x, m15}, m0, m1, m2, m3 + phaddd m0, m1 + phaddd m2, m3 + paddd m0, m12 + paddd m2, m12 + psrad m0, xm7 + psrad m2, xm7 + packssdw m0, m2 ; 0 1 2 3 4 5 6 7 + vextracti128 xm1, m0, 1 + palignr xm2, xm1, xm0, 4 ; 1 2 3 4 + punpcklwd xm3, 
xm0, xm2 ; 01 12 + punpckhwd xm0, xm2 ; 23 34 + pshufd xm4, xm1, q0321 ; 5 6 7 _ + punpcklwd xm2, xm1, xm4 ; 45 56 + punpckhwd xm4, xm1, xm4 ; 67 __ +.w2_loop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq xm14, r6q + pmovsxbw xm14, xm14 + pshufd xm8, xm14, q0000 + pshufd xm9, xm14, q1111 + pmaddwd xm5, xm3, xm8 + pmaddwd xm6, xm0, xm9 + pshufd xm8, xm14, q2222 + pshufd xm14, xm14, q3333 + paddd xm5, xm6 + pmaddwd xm6, xm2, xm8 + pmaddwd xm8, xm4, xm14 + psrldq xm9, xm7, 8 + paddd xm5, xm6 + paddd xm5, xm13 + paddd xm5, xm8 + psrad xm5, xm9 + packusdw xm5, xm5 + pminsw xm5, xm11 + movd [dstq], xm5 + add dstq, dsq + dec hd + jz .ret + add myd, dyd + test myd, ~0x3ff + jz .w2_loop + movu xm5, [srcq] + test myd, 0x400 + jz .w2_skip_line + add srcq, ssq + shufps xm3, xm0, q1032 ; 01 12 + shufps xm0, xm2, q1032 ; 23 34 + shufps xm2, xm4, q1032 ; 45 56 + pshufb xm5, xm10 + pmaddwd xm5, xm15 + phaddd xm5, xm5 + paddd xm5, xm12 + psrad xm5, xm7 + packssdw xm5, xm5 + palignr xm1, xm5, xm1, 12 + punpcklqdq xm1, xm1 ; 6 7 6 7 + punpcklwd xm4, xm1, xm5 ; 67 __ + jmp .w2_loop +.w2_skip_line: + movu xm6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova xm3, xm0 ; 01 12 + mova xm0, xm2 ; 23 34 + pshufb xm5, xm10 + pshufb xm6, xm10 + pmaddwd xm5, xm15 + pmaddwd xm6, xm15 + phaddd xm5, xm6 + paddd xm5, xm12 + psrad xm5, xm7 + packssdw xm5, xm5 ; 6 7 6 7 + palignr xm1, xm5, xm1, 8 ; 4 5 6 7 + pshufd xm5, xm1, q0321 ; 5 6 7 _ + punpcklwd xm2, xm1, xm5 ; 45 56 + punpckhwd xm4, xm1, xm5 ; 67 __ + jmp .w2_loop +%endif +.w4: + mov myd, mym + mova [rsp+0x00], m12 +%if isput + mova [rsp+0x20], xm13 +%else + SWAP m11, m13 +%endif + mova [rsp+0x30], xm7 + vbroadcasti128 m7, [base+rescale_mul] + movzx t0d, t0b + sub srcq, 2 + movd xm15, t0d + pmaddwd m8, m7 + vpbroadcastq m2, [base+pq_0x40000000+1] + vpbroadcastd xm15, xm15 + SWAP m13, m10 + paddd m13, m8 ; mx+dx*[0-3] + pand m6, m13 + psrld m6, 6 + paddd xm15, xm6 + movd r4d, xm15 + pextrd r6d, xm15, 1 + pextrd r11d, xm15, 2 + pextrd r13d, xm15, 3 + vbroadcasti128 m5, [base+bdct_lb_q+ 0] + vbroadcasti128 m1, [base+bdct_lb_q+16] + vbroadcasti128 m0, [base+subpel_s_shuf2] + vpbroadcastd xm14, [base+subpel_filters+r4*8+2] + vpbroadcastd xm7, [base+subpel_filters+r6*8+2] + vpbroadcastd xm15, [base+subpel_filters+r11*8+2] + vpbroadcastd xm8, [base+subpel_filters+r13*8+2] + pcmpeqd m6, m9 + punpckldq m10, m6, m6 + punpckhdq m6, m6 + psrld m13, 10 + paddd m13, m13 + vpblendd xm14, xm7, 0xa + vpblendd xm15, xm8, 0xa + pmovsxbw m14, xm14 + pmovsxbw m15, xm15 + pblendvb m14, m2, m10 + pblendvb m15, m2, m6 + pextrd r4, xm13, 2 + pshufb m12, m13, m5 + pshufb m13, m1 + lea r6, [r4+ssq*1] + lea r11, [r4+ssq*2] + lea r13, [r4+ss3q ] + movu xm7, [srcq+ssq*0] + movu xm9, [srcq+ssq*1] + movu xm8, [srcq+ssq*2] + movu xm10, [srcq+ss3q ] + movu xm1, [srcq+r4 ] + movu xm3, [srcq+r6 ] + movu xm2, [srcq+r11 ] + movu xm4, [srcq+r13 ] + lea srcq, [srcq+ssq*4] + vinserti128 m7, [srcq+ssq*0], 1 + vinserti128 m9, [srcq+ssq*1], 1 + vinserti128 m8, [srcq+ssq*2], 1 + vinserti128 m10, [srcq+ss3q ], 1 + vinserti128 m1, [srcq+r4 ], 1 + vinserti128 m3, [srcq+r6 ], 1 + vinserti128 m2, [srcq+r11 ], 1 + vinserti128 m4, [srcq+r13 ], 1 + lea srcq, [srcq+ssq*4] + vpbroadcastb m5, xm13 + psubb m13, m5 + paddb m12, m0 + paddb m13, m0 + REPX {pshufb x, m12}, m7, m9, m8, m10 + REPX {pmaddwd x, m14}, m7, m9, m8, m10 + REPX {pshufb x, m13}, m1, m2, m3, m4 + REPX {pmaddwd x, m15}, m1, m2, m3, m4 + mova m5, [rsp+0x00] + movd xm6, 
[rsp+0x30] + phaddd m7, m1 + phaddd m9, m3 + phaddd m8, m2 + phaddd m10, m4 + REPX {paddd x, m5}, m7, m9, m8, m10 + REPX {psrad x, xm6}, m7, m9, m8, m10 + packssdw m7, m9 ; 0 1 4 5 + packssdw m8, m10 ; 2 3 6 7 + vextracti128 xm9, m7, 1 ; 4 5 + vextracti128 xm3, m8, 1 ; 6 7 + shufps xm4, xm7, xm8, q1032 ; 1 2 + shufps xm5, xm8, xm9, q1032 ; 3 4 + shufps xm6, xm9, xm3, q1032 ; 5 6 + psrldq xm10, xm3, 8 ; 7 _ + punpcklwd xm0, xm7, xm4 ; 01 + punpckhwd xm7, xm4 ; 12 + punpcklwd xm1, xm8, xm5 ; 23 + punpckhwd xm8, xm5 ; 34 + punpcklwd xm2, xm9, xm6 ; 45 + punpckhwd xm9, xm6 ; 56 + punpcklwd xm3, xm10 ; 67 + mova [rsp+0x40], xm7 + mova [rsp+0x50], xm8 + mova [rsp+0x60], xm9 +.w4_loop: + and myd, 0x3ff + mov r11d, 64 << 24 + mov r13d, myd + shr r13d, 6 + lea r13d, [t1+r13] + cmovnz r11q, [base+subpel_filters+r13*8] + movq xm9, r11q + pmovsxbw xm9, xm9 + pshufd xm7, xm9, q0000 + pshufd xm8, xm9, q1111 + pmaddwd xm4, xm0, xm7 + pmaddwd xm5, xm1, xm8 + pshufd xm7, xm9, q2222 + pshufd xm9, xm9, q3333 + pmaddwd xm6, xm2, xm7 + pmaddwd xm8, xm3, xm9 +%if isput + mova xm7, [rsp+0x20] + movd xm9, [rsp+0x38] +%else + SWAP m7, m11 +%endif + paddd xm4, xm5 + paddd xm6, xm8 + paddd xm4, xm6 + paddd xm4, xm7 +%if isput + psrad xm4, xm9 + packusdw xm4, xm4 + pminuw xm4, xm11 + movq [dstq], xm4 + add dstq, dsq +%else + SWAP m11, m7 + psrad xm4, 6 + packssdw xm4, xm4 + movq [tmpq], xm4 + add tmpq, 8 +%endif + dec hd + jz .ret + add myd, dyd + test myd, ~0x3ff + jz .w4_loop + mova xm8, [rsp+0x00] + movd xm9, [rsp+0x30] + movu xm4, [srcq] + movu xm5, [srcq+r4] + test myd, 0x400 + jz .w4_skip_line + mova xm0, [rsp+0x40] + mova [rsp+0x40], xm1 + mova xm1, [rsp+0x50] + mova [rsp+0x50], xm2 + mova xm2, [rsp+0x60] + mova [rsp+0x60], xm3 + pshufb xm4, xm12 + pshufb xm5, xm13 + pmaddwd xm4, xm14 + pmaddwd xm5, xm15 + phaddd xm4, xm5 + paddd xm4, xm8 + psrad xm4, xm9 + packssdw xm4, xm4 + punpcklwd xm3, xm10, xm4 + mova xm10, xm4 + add srcq, ssq + jmp .w4_loop +.w4_skip_line: + movu xm6, [srcq+ssq*1] + movu xm7, [srcq+r6] + movu m0, [rsp+0x50] + pshufb xm4, xm12 + pshufb xm6, xm12 + pshufb xm5, xm13 + pshufb xm7, xm13 + pmaddwd xm4, xm14 + pmaddwd xm6, xm14 + pmaddwd xm5, xm15 + pmaddwd xm7, xm15 + mova [rsp+0x40], m0 + phaddd xm4, xm5 + phaddd xm6, xm7 + paddd xm4, xm8 + paddd xm6, xm8 + psrad xm4, xm9 + psrad xm6, xm9 + packssdw xm4, xm6 + punpcklwd xm9, xm10, xm4 + mova [rsp+0x60], xm9 + psrldq xm10, xm4, 8 + mova xm0, xm1 + mova xm1, xm2 + mova xm2, xm3 + punpcklwd xm3, xm4, xm10 + lea srcq, [srcq+ssq*2] + jmp .w4_loop + SWAP m10, m13 +%if isprep + SWAP m13, m11 +%endif +.w8: + mov dword [rsp+0x80], 1 + movifprep tmp_stridem, 16 + jmp .w_start +.w16: + mov dword [rsp+0x80], 2 + movifprep tmp_stridem, 32 + jmp .w_start +.w32: + mov dword [rsp+0x80], 4 + movifprep tmp_stridem, 64 + jmp .w_start +.w64: + mov dword [rsp+0x80], 8 + movifprep tmp_stridem, 128 + jmp .w_start +.w128: + mov dword [rsp+0x80], 16 + movifprep tmp_stridem, 256 +.w_start: + SWAP m10, m12, m1 + SWAP m11, m7 + ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free +%if isput + movifnidn dsm, dsq + mova [rsp+0xb0], xm7 +%endif + mova [rsp+0x00], m10 + mova [rsp+0x20], m13 + shr t0d, 16 + sub srcq, 6 + pmaddwd m8, [base+rescale_mul2] + movd xm15, t0d + mov [rsp+0x84], t0d + mov [rsp+0x88], srcq + mov [rsp+0x90], r0q ; dstq / tmpq +%if UNIX64 + mov hm, hd +%endif + shl dword dxm, 3 ; dx*8 + vpbroadcastd m15, xm15 + paddd m1, m8 ; mx+dx*[0-7] + jmp .hloop +.hloop_prep: + dec dword [rsp+0x80] + jz .ret + add qword [rsp+0x90], 16 + mov hd, hm + vpbroadcastd m8, 
dxm + vpbroadcastd m6, [base+pd_0x3ff] + paddd m1, m8, [rsp+0x40] + vpbroadcastd m15, [rsp+0x84] + pxor m9, m9 + mov srcq, [rsp+0x88] + mov r0q, [rsp+0x90] ; dstq / tmpq +.hloop: + vpbroadcastq xm2, [base+pq_0x40000000] + pand m5, m1, m6 + psrld m5, 6 + paddd m15, m5 + pcmpeqd m5, m9 + vextracti128 xm7, m15, 1 + movq r6, xm15 + pextrq r9, xm15, 1 + movq r11, xm7 + pextrq rX, xm7, 1 + mov r4d, r6d + shr r6, 32 + mov r7d, r9d + shr r9, 32 + mov r10d, r11d + shr r11, 32 + mov r13d, rXd + shr rX, 32 + mova [rsp+0x40], m1 + movq xm12, [base+subpel_filters+ r4*8] + movq xm13, [base+subpel_filters+ r6*8] + movhps xm12, [base+subpel_filters+ r7*8] + movhps xm13, [base+subpel_filters+ r9*8] + movq xm14, [base+subpel_filters+r10*8] + movq xm15, [base+subpel_filters+r11*8] + movhps xm14, [base+subpel_filters+r13*8] + movhps xm15, [base+subpel_filters+ rX*8] + psrld m1, 10 + vextracti128 xm7, m1, 1 + vextracti128 xm6, m5, 1 + movq [rsp+0xa0], xm1 + movq [rsp+0xa8], xm7 + movq r6, xm1 + pextrq r11, xm1, 1 + movq r9, xm7 + pextrq rX, xm7, 1 + mov r4d, r6d + shr r6, 32 + mov r10d, r11d + shr r11, 32 + mov r7d, r9d + shr r9, 32 + mov r13d, rXd + shr rX, 32 + pshufd xm4, xm5, q2200 + pshufd xm5, xm5, q3311 + pshufd xm7, xm6, q2200 + pshufd xm6, xm6, q3311 + pblendvb xm12, xm2, xm4 + pblendvb xm13, xm2, xm5 + pblendvb xm14, xm2, xm7 + pblendvb xm15, xm2, xm6 + pmovsxbw m12, xm12 + pmovsxbw m13, xm13 + pmovsxbw m14, xm14 + pmovsxbw m15, xm15 + MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + mova [rsp+0x60], m0 + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b + mova m0, [rsp+0x60] + vbroadcasti128 m9, [base+subpel_s_shuf8] + mov myd, mym + mov dyd, dym + pshufb m0, m9 ; 01a 01b + pshufb m1, m9 ; 23a 23b + pshufb m2, m9 ; 45a 45b + pshufb m3, m9 ; 67a 67b +.vloop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq xm9, r6q + punpcklqdq xm9, xm9 + pmovsxbw m9, xm9 + pshufd m8, m9, q0000 + pshufd m7, m9, q1111 + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m7 + pshufd m8, m9, q2222 + pshufd m9, m9, q3333 + pmaddwd m6, m2, m8 + pmaddwd m7, m3, m9 +%if isput + psrldq xm8, xm11, 8 +%endif + paddd m4, [rsp+0x20] + paddd m6, m7 + paddd m4, m5 + paddd m4, m6 +%if isput + psrad m4, xm8 + vextracti128 xm5, m4, 1 + packusdw xm4, xm5 + pminsw xm4, [rsp+0xb0] + mova [dstq], xm4 + add dstq, dsm +%else + psrad m4, 6 + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + mova [tmpq], xm4 + add tmpq, tmp_stridem +%endif + dec hd + jz .hloop_prep + add myd, dyd + test myd, ~0x3ff + jz .vloop + test myd, 0x400 + mov [rsp+0x60], myd + mov r4d, [rsp+0xa0] + mov r6d, [rsp+0xa4] + mov r7d, [rsp+0xa8] + mov r9d, [rsp+0xac] + jz .skip_line + vbroadcasti128 m9, [base+wswap] + movu xm4, [srcq+ r4*2] + movu xm5, [srcq+ r6*2] + movu xm6, [srcq+ r7*2] + movu xm7, [srcq+ r9*2] + vinserti128 m4, [srcq+r10*2], 1 + vinserti128 m5, [srcq+r11*2], 1 + vinserti128 m6, [srcq+r13*2], 1 + vinserti128 m7, [srcq+ rX*2], 1 + add srcq, ssq + mov myd, [rsp+0x60] + mov dyd, dym + pshufb m0, m9 + pshufb m1, m9 + pshufb m2, m9 + pshufb m3, m9 + pmaddwd m4, m12 + pmaddwd m5, m13 + pmaddwd m6, m14 + pmaddwd m7, m15 + phaddd m4, m5 + phaddd m6, m7 + phaddd m4, m6 + paddd m4, m10 + psrad m4, xm11 + pslld m4, 16 + pblendw m0, m1, 0xaa + pblendw m1, m2, 0xaa + pblendw m2, m3, 0xaa + pblendw m3, m4, 0xaa + jmp .vloop +.skip_line: + mova m0, m1 + mova m1, m2 + mova m2, m3 + 
MC_8TAP_SCALED_H 3, 10, 4, 5, 6, 7, 8, 9, 1 + vbroadcasti128 m9, [base+subpel_s_shuf8] + mov myd, [rsp+0x60] + mov dyd, dym + pshufb m3, m9 + jmp .vloop + SWAP m1, m12, m10 + SWAP m7, m11 +.dy1: + movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2] + add wq, base_reg + jmp wq +%if isput +.dy1_w2: + mov myd, mym + movzx t0d, t0b + sub srcq, 2 + movd xm15, t0d + punpckldq m8, m9, m8 + paddd m10, m8 ; mx+dx*[0-1] + vpbroadcastd xm14, [base+pq_0x40000000+2] + vpbroadcastd xm15, xm15 + pand xm8, xm10, xm6 + psrld xm8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + vbroadcasti128 m5, [base+bdct_lb_q] + vbroadcasti128 m6, [base+subpel_s_shuf2] + vpbroadcastd m15, [base+subpel_filters+r4*8+2] + vpbroadcastd m4, [base+subpel_filters+r6*8+2] + pcmpeqd xm8, xm9 + psrld m10, 10 + paddd m10, m10 + movu xm0, [srcq+ssq*0] + movu xm1, [srcq+ssq*1] + movu xm2, [srcq+ssq*2] + movu xm3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pshufb m10, m5 + paddb m10, m6 + vpblendd xm15, xm4, 0xa + pblendvb xm15, xm14, xm8 + pmovsxbw m15, xm15 + vinserti128 m0, [srcq+ssq*0], 1 + vinserti128 m1, [srcq+ssq*1], 1 + vinserti128 m2, [srcq+ssq*2], 1 + add srcq, ss3q + movq xm6, r4q + pmovsxbw xm6, xm6 + pshufd xm8, xm6, q0000 + pshufd xm9, xm6, q1111 + pshufd xm14, xm6, q2222 + pshufd xm6, xm6, q3333 + REPX {pshufb x, m10}, m0, m1, m2 + pshufb xm3, xm10 + REPX {pmaddwd x, m15}, m0, m1, m2 + pmaddwd xm3, xm15 + phaddd m0, m1 + phaddd m2, m3 + paddd m0, m12 + paddd m2, m12 + psrad m0, xm7 + psrad m2, xm7 + packssdw m0, m2 + vextracti128 xm1, m0, 1 + palignr xm2, xm1, xm0, 4 + pshufd xm4, xm1, q2121 + punpcklwd xm3, xm0, xm2 ; 01 12 + punpckhwd xm0, xm2 ; 23 34 + punpcklwd xm2, xm1, xm4 ; 45 56 +.dy1_w2_loop: + movu xm1, [srcq+ssq*0] + movu xm5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xm1, xm10 + pshufb xm5, xm10 + pmaddwd xm1, xm15 + pmaddwd xm5, xm15 + phaddd xm1, xm5 + pmaddwd xm5, xm3, xm8 + mova xm3, xm0 + pmaddwd xm0, xm9 + paddd xm1, xm12 + psrad xm1, xm7 + packssdw xm1, xm1 + paddd xm5, xm0 + mova xm0, xm2 + pmaddwd xm2, xm14 + paddd xm5, xm2 + palignr xm2, xm1, xm4, 12 + punpcklwd xm2, xm1 ; 67 78 + pmaddwd xm4, xm2, xm6 + paddd xm5, xm13 + paddd xm5, xm4 + mova xm4, xm1 + psrldq xm1, xm7, 8 + psrad xm5, xm1 + packusdw xm5, xm5 + pminsw xm5, xm11 + movd [dstq+dsq*0], xm5 + pextrd [dstq+dsq*1], xm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy1_w2_loop + RET +%endif +.dy1_w4: + mov myd, mym +%if isput + mova [rsp+0x50], xm11 +%endif + mova [rsp+0x00], m12 + mova [rsp+0x20], m13 + mova [rsp+0x40], xm7 + vbroadcasti128 m7, [base+rescale_mul] + movzx t0d, t0b + sub srcq, 2 + movd xm15, t0d + pmaddwd m8, m7 + vpbroadcastq m2, [base+pq_0x40000000+1] + vpbroadcastd xm15, xm15 + SWAP m13, m10 + paddd m13, m8 ; mx+dx*[0-3] + pand m6, m13 + psrld m6, 6 + paddd xm15, xm6 + movd r4d, xm15 + pextrd r6d, xm15, 1 + pextrd r11d, xm15, 2 + pextrd r13d, xm15, 3 + vbroadcasti128 m5, [base+bdct_lb_q+ 0] + vbroadcasti128 m1, [base+bdct_lb_q+16] + vbroadcasti128 m4, [base+subpel_s_shuf2] + vpbroadcastd xm14, [base+subpel_filters+r4*8+2] + vpbroadcastd xm7, [base+subpel_filters+r6*8+2] + vpbroadcastd xm15, [base+subpel_filters+r11*8+2] + vpbroadcastd xm8, [base+subpel_filters+r13*8+2] + pcmpeqd m6, m9 + punpckldq m10, m6, m6 + punpckhdq m6, m6 + psrld m13, 10 + paddd m13, m13 + vpblendd xm14, xm7, 0xa + vpblendd xm15, xm8, 0xa + pmovsxbw m14, xm14 + pmovsxbw m15, xm15 + pblendvb m14, m2, m10 + pblendvb m15, m2, m6 + pextrd 
r4, xm13, 2 + pshufb m12, m13, m5 + pshufb m13, m1 + lea r6, [r4+ssq*2] + lea r11, [r4+ssq*1] + lea r13, [r4+ss3q ] + movu xm0, [srcq+ssq*0] + movu xm7, [srcq+r4 ] + movu xm1, [srcq+ssq*2] + movu xm8, [srcq+r6 ] + vinserti128 m0, [srcq+ssq*1], 1 ; 0 1 + vinserti128 m7, [srcq+r11 ], 1 + vinserti128 m1, [srcq+ss3q ], 1 ; 2 3 + vinserti128 m8, [srcq+r13 ], 1 + lea srcq, [srcq+ssq*4] + movu xm2, [srcq+ssq*0] + movu xm9, [srcq+r4 ] + movu xm3, [srcq+ssq*2] ; 6 _ + movu xm10, [srcq+r6 ] + vinserti128 m2, [srcq+ssq*1], 1 ; 4 5 + vinserti128 m9, [srcq+r11 ], 1 + lea srcq, [srcq+ss3q ] + vpbroadcastb m5, xm13 + psubb m13, m5 + paddb m12, m4 + paddb m13, m4 + mova m5, [rsp+0x00] + movd xm6, [rsp+0x40] + pshufb m0, m12 + pshufb m1, m12 + pmaddwd m0, m14 + pmaddwd m1, m14 + pshufb m7, m13 + pshufb m8, m13 + pmaddwd m7, m15 + pmaddwd m8, m15 + pshufb m2, m12 + pshufb xm3, xm12 + pmaddwd m2, m14 + pmaddwd xm3, xm14 + pshufb m9, m13 + pshufb xm10, xm13 + pmaddwd m9, m15 + pmaddwd xm10, xm15 + phaddd m0, m7 + phaddd m1, m8 + phaddd m2, m9 + phaddd xm3, xm10 + paddd m0, m5 + paddd m1, m5 + paddd m2, m5 + paddd xm3, xm5 + psrad m0, xm6 + psrad m1, xm6 + psrad m2, xm6 + psrad xm3, xm6 + vperm2i128 m4, m0, m1, 0x21 ; 1 2 + vperm2i128 m5, m1, m2, 0x21 ; 3 4 + vperm2i128 m6, m2, m3, 0x21 ; 5 6 + shr myd, 6 + mov r13d, 64 << 24 + lea myd, [t1+myq] + cmovnz r13q, [base+subpel_filters+myq*8] + pslld m4, 16 + pslld m5, 16 + pslld m6, 16 + pblendw m0, m4, 0xaa ; 01 12 + pblendw m1, m5, 0xaa ; 23 34 + pblendw m2, m6, 0xaa ; 45 56 + movq xm10, r13q + punpcklqdq xm10, xm10 + pmovsxbw m10, xm10 + pshufd m7, m10, q0000 + pshufd m8, m10, q1111 + pshufd m9, m10, q2222 + pshufd m10, m10, q3333 +.dy1_w4_loop: + movu xm11, [srcq+ssq*0] + movu xm6, [srcq+r4 ] + vinserti128 m11, [srcq+ssq*1], 1 + vinserti128 m6, [srcq+r11 ], 1 + lea srcq, [srcq+ssq*2] + pmaddwd m4, m0, m7 + pmaddwd m5, m1, m8 + pshufb m11, m12 + pshufb m6, m13 + pmaddwd m11, m14 + pmaddwd m6, m15 + paddd m4, [rsp+0x20] + phaddd m11, m6 + pmaddwd m6, m2, m9 + paddd m11, [rsp+0x00] + psrad m11, [rsp+0x40] + mova m0, m1 + mova m1, m2 + paddd m5, m6 + paddd m4, m5 + vinserti128 m2, m3, xm11, 1 + pslld m3, m11, 16 + pblendw m2, m3, 0xaa ; 67 78 + pmaddwd m5, m2, m10 + vextracti128 xm3, m11, 1 + paddd m4, m5 +%if isput + psrad m4, [rsp+0x48] + vextracti128 xm5, m4, 1 + packusdw xm4, xm5 + pminsw xm4, [rsp+0x50] + movq [dstq+dsq*0], xm4 + movhps [dstq+dsq*1], xm4 + lea dstq, [dstq+dsq*2] +%else + psrad m4, 6 + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + mova [tmpq], xm4 + add tmpq, 16 +%endif + sub hd, 2 + jg .dy1_w4_loop + MC_8TAP_SCALED_RET + SWAP m10, m13 +.dy1_w8: + mov dword [rsp+0xa0], 1 + movifprep tmp_stridem, 16 + jmp .dy1_w_start +.dy1_w16: + mov dword [rsp+0xa0], 2 + movifprep tmp_stridem, 32 + jmp .dy1_w_start +.dy1_w32: + mov dword [rsp+0xa0], 4 + movifprep tmp_stridem, 64 + jmp .dy1_w_start +.dy1_w64: + mov dword [rsp+0xa0], 8 + movifprep tmp_stridem, 128 + jmp .dy1_w_start +.dy1_w128: + mov dword [rsp+0xa0], 16 + movifprep tmp_stridem, 256 +.dy1_w_start: + SWAP m10, m12, m1 + SWAP m11, m7 + ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free + mov myd, mym +%if isput + %define dsm [rsp+0xb8] + movifnidn dsm, dsq + mova [rsp+0xc0], xm7 +%else + %if UNIX64 + %define hm [rsp+0xb8] + %endif +%endif + mova [rsp+0x00], m10 + mova [rsp+0x20], m13 + mova [rsp+0x40], xm11 + shr t0d, 16 + sub srcq, 6 + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pmaddwd m8, [base+rescale_mul2] + movd xm15, t0d + mov [rsp+0xa4], 
t0d + mov [rsp+0xa8], srcq + mov [rsp+0xb0], r0q ; dstq / tmpq +%if UNIX64 + mov hm, hd +%endif + shl dword dxm, 3 ; dx*8 + vpbroadcastd m15, xm15 + paddd m1, m8 ; mx+dx*[0-7] + movq xm0, r4q + pmovsxbw xm0, xm0 + mova [rsp+0x50], xm0 + jmp .dy1_hloop +.dy1_hloop_prep: + dec dword [rsp+0xa0] + jz .ret + add qword [rsp+0xb0], 16 + mov hd, hm + vpbroadcastd m8, dxm + vpbroadcastd m6, [base+pd_0x3ff] + paddd m1, m8, [rsp+0x60] + vpbroadcastd m15, [rsp+0xa4] + pxor m9, m9 + mov srcq, [rsp+0xa8] + mov r0q, [rsp+0xb0] ; dstq / tmpq + mova m10, [rsp+0x00] + mova xm11, [rsp+0x40] +.dy1_hloop: + vpbroadcastq xm2, [base+pq_0x40000000] + pand m5, m1, m6 + psrld m5, 6 + paddd m15, m5 + pcmpeqd m5, m9 + vextracti128 xm7, m15, 1 + movq r6, xm15 + pextrq r9, xm15, 1 + movq r11, xm7 + pextrq rX, xm7, 1 + mov r4d, r6d + shr r6, 32 + mov r7d, r9d + shr r9, 32 + mov r10d, r11d + shr r11, 32 + mov r13d, rXd + shr rX, 32 + mova [rsp+0x60], m1 + movq xm12, [base+subpel_filters+ r4*8] + movq xm13, [base+subpel_filters+ r6*8] + movhps xm12, [base+subpel_filters+ r7*8] + movhps xm13, [base+subpel_filters+ r9*8] + movq xm14, [base+subpel_filters+r10*8] + movq xm15, [base+subpel_filters+r11*8] + movhps xm14, [base+subpel_filters+r13*8] + movhps xm15, [base+subpel_filters+ rX*8] + psrld m1, 10 + vextracti128 xm7, m1, 1 + vextracti128 xm6, m5, 1 + movq r6, xm1 + pextrq r11, xm1, 1 + movq r9, xm7 + pextrq rX, xm7, 1 + mov r4d, r6d + shr r6, 32 + mov r10d, r11d + shr r11, 32 + mov r7d, r9d + shr r9, 32 + mov r13d, rXd + shr rX, 32 + pshufd xm4, xm5, q2200 + pshufd xm5, xm5, q3311 + pshufd xm7, xm6, q2200 + pshufd xm6, xm6, q3311 + pblendvb xm12, xm2, xm4 + pblendvb xm13, xm2, xm5 + pblendvb xm14, xm2, xm7 + pblendvb xm15, xm2, xm6 + pmovsxbw m12, xm12 + pmovsxbw m13, xm13 + pmovsxbw m14, xm14 + pmovsxbw m15, xm15 + MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + mova [rsp+0x80], m0 + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b + mova m0, [rsp+0x80] + vbroadcasti128 m7, [base+subpel_s_shuf8] + vpbroadcastd m8, [rsp+0x50] + vpbroadcastd m9, [rsp+0x54] + vpbroadcastd m10, [rsp+0x58] + vpbroadcastd m11, [rsp+0x5c] + pshufb m0, m7 ; 01a 01b + pshufb m1, m7 ; 23a 23b + pshufb m2, m7 ; 45a 45b + pshufb m3, m7 ; 67a 67b +.dy1_vloop: + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pmaddwd m6, m2, m10 + pmaddwd m7, m3, m11 + paddd m4, [rsp+0x20] + paddd m6, m7 + paddd m4, m5 + paddd m4, m6 +%if isput + psrad m4, [rsp+0x48] + vextracti128 xm5, m4, 1 + packusdw xm4, xm5 + pminsw xm4, [rsp+0xc0] + mova [dstq], xm4 + add dstq, dsm +%else + psrad m4, 6 + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + mova [tmpq], xm4 + add tmpq, tmp_stridem +%endif + dec hd + jz .dy1_hloop_prep + vbroadcasti128 m7, [base+wswap] + pshufb m0, m7 + pshufb m1, m7 + pshufb m2, m7 + pshufb m3, m7 + movu xm4, [srcq+ r4*2] + movu xm5, [srcq+ r6*2] + movu xm6, [srcq+ r7*2] + movu xm7, [srcq+ r9*2] + vinserti128 m4, [srcq+r10*2], 1 + vinserti128 m5, [srcq+r11*2], 1 + vinserti128 m6, [srcq+r13*2], 1 + vinserti128 m7, [srcq+ rX*2], 1 + add srcq, ssq + pmaddwd m4, m12 + pmaddwd m5, m13 + pmaddwd m6, m14 + pmaddwd m7, m15 + phaddd m4, m5 + phaddd m6, m7 + phaddd m4, m6 + paddd m4, [rsp+0x00] + psrad m4, [rsp+0x40] + pslld m4, 16 + pblendw m0, m1, 0xaa + pblendw m1, m2, 0xaa + pblendw m2, m3, 0xaa + pblendw m3, m4, 0xaa + jmp .dy1_vloop + SWAP m1, m12, m10 + SWAP m7, m11 +.dy2: + movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2] + add 
wq, base_reg + jmp wq +%if isput +.dy2_w2: + mov myd, mym + movzx t0d, t0b + sub srcq, 2 + movd xm15, t0d + punpckldq m8, m9, m8 + paddd m10, m8 ; mx+dx*[0-1] + vpbroadcastd xm14, [base+pq_0x40000000+2] + vpbroadcastd xm15, xm15 + pand xm8, xm10, xm6 + psrld xm8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + vbroadcasti128 m5, [base+bdct_lb_q] + vbroadcasti128 m6, [base+subpel_s_shuf2] + vpbroadcastd xm15, [base+subpel_filters+r4*8+2] + vpbroadcastd xm4, [base+subpel_filters+r6*8+2] + pcmpeqd xm8, xm9 + psrld m10, 10 + paddd m10, m10 + movu xm0, [srcq+ssq*0] + movu xm1, [srcq+ssq*2] + movu xm2, [srcq+ssq*4] + pshufb m10, m5 + paddb m10, m6 + vpblendd xm15, xm4, 0xa + pblendvb xm15, xm14, xm8 + pmovsxbw m15, xm15 + vinserti128 m0, [srcq+ssq*1], 1 ; 0 1 + vinserti128 m1, [srcq+ss3q ], 1 ; 2 3 + lea srcq, [srcq+ssq*4] + vinserti128 m2, [srcq+ssq*1], 1 ; 4 5 + lea srcq, [srcq+ssq*2] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pshufb m0, m10 + pshufb m1, m10 + pshufb m2, m10 + pmaddwd m0, m15 + pmaddwd m1, m15 + pmaddwd m2, m15 + movq xm6, r4q + pmovsxbw xm6, xm6 + phaddd m0, m1 + phaddd m1, m2 + paddd m0, m12 + paddd m1, m12 + psrad m0, xm7 + psrad m1, xm7 + packssdw m0, m1 ; 0 2 2 4 1 3 3 5 + vextracti128 xm1, m0, 1 + pshufd xm8, xm6, q0000 + pshufd xm9, xm6, q1111 + pshufd xm14, xm6, q2222 + pshufd xm6, xm6, q3333 + punpcklwd xm2, xm0, xm1 ; 01 23 + punpckhwd xm1, xm0, xm1 ; 23 45 +.dy2_w2_loop: + movu xm3, [srcq+ssq*0] + movu xm5, [srcq+ssq*2] + vinserti128 m3, [srcq+ssq*1], 1 ; 6 7 + vinserti128 m5, [srcq+ss3q ], 1 ; 8 9 + lea srcq, [srcq+ssq*4] + pmaddwd xm4, xm2, xm8 + pmaddwd xm1, xm9 + pshufb m3, m10 + pshufb m5, m10 + pmaddwd m3, m15 + pmaddwd m5, m15 + phaddd m3, m5 + paddd xm4, xm1 + paddd m3, m12 + psrad m3, xm7 + packssdw m3, m3 + pshufd m3, m3, q2100 + palignr m0, m3, m0, 12 ; 4 6 6 8 5 7 7 9 + vextracti128 xm1, m0, 1 + punpcklwd xm2, xm0, xm1 ; 45 67 + punpckhwd xm1, xm0, xm1 ; 67 89 + pmaddwd xm3, xm2, xm14 + pmaddwd xm5, xm1, xm6 + paddd xm4, xm13 + paddd xm4, xm3 + psrldq xm3, xm7, 8 + paddd xm4, xm5 + psrad xm4, xm3 + packusdw xm4, xm4 + pminsw xm4, xm11 + movd [dstq+dsq*0], xm4 + pextrd [dstq+dsq*1], xm4, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy2_w2_loop + RET +%endif +.dy2_w4: + mov myd, mym +%if isput + mova [rsp+0x50], xm11 +%endif + mova [rsp+0x00], m12 + mova [rsp+0x20], m13 + mova [rsp+0x40], xm7 + vbroadcasti128 m7, [base+rescale_mul] + movzx t0d, t0b + sub srcq, 2 + movd xm15, t0d + pmaddwd m8, m7 + vpbroadcastq m2, [base+pq_0x40000000+1] + vpbroadcastd xm15, xm15 + SWAP m13, m10 + paddd m13, m8 ; mx+dx*[0-3] + pand m6, m13 + psrld m6, 6 + paddd xm15, xm6 + movd r4d, xm15 + pextrd r6d, xm15, 1 + pextrd r11d, xm15, 2 + pextrd r13d, xm15, 3 + vbroadcasti128 m5, [base+bdct_lb_q+ 0] + vbroadcasti128 m1, [base+bdct_lb_q+16] + vbroadcasti128 m4, [base+subpel_s_shuf2] + vpbroadcastd xm14, [base+subpel_filters+r4*8+2] + vpbroadcastd xm7, [base+subpel_filters+r6*8+2] + vpbroadcastd xm15, [base+subpel_filters+r11*8+2] + vpbroadcastd xm8, [base+subpel_filters+r13*8+2] + shr myd, 6 + mov r13d, 64 << 24 + lea myd, [t1+myq] + cmovnz r13q, [base+subpel_filters+myq*8] + pcmpeqd m6, m9 + punpckldq m11, m6, m6 + punpckhdq m6, m6 + psrld m13, 10 + paddd m13, m13 + vpblendd xm14, xm7, 0xa + vpblendd xm15, xm8, 0xa + pmovsxbw m14, xm14 + pmovsxbw m15, xm15 + movq xm10, r13q + pblendvb m14, m2, m11 + pblendvb m15, m2, m6 + pextrd r4, xm13, 2 + pshufb m12, m13, m5 + pshufb m13, m1 + lea r6, [r4+ssq*1] + lea r11, 
[r4+ssq*2] + lea r13, [r4+ss3q ] + movu xm0, [srcq+ssq*0] + movu xm7, [srcq+r4 ] + movu xm1, [srcq+ssq*1] + movu xm8, [srcq+r6 ] + vinserti128 m0, [srcq+ssq*2], 1 ; 0 2 + vinserti128 m7, [srcq+r11 ], 1 + vinserti128 m1, [srcq+ss3q ], 1 ; 1 3 + vinserti128 m8, [srcq+r13 ], 1 + lea srcq, [srcq+ssq*4] + movu xm2, [srcq+ssq*0] + movu xm9, [srcq+r4 ] + vinserti128 m2, [srcq+ssq*1], 1 ; 4 5 + vinserti128 m9, [srcq+r6 ], 1 + lea srcq, [srcq+ssq*2] + vpbroadcastb m5, xm13 + psubb m13, m5 + paddb m12, m4 + paddb m13, m4 + mova m5, [rsp+0x00] + movd xm6, [rsp+0x40] + pshufb m0, m12 + pshufb m1, m12 + pshufb m2, m12 + pmaddwd m0, m14 + pmaddwd m1, m14 + pmaddwd m2, m14 + pshufb m7, m13 + pshufb m8, m13 + pshufb m9, m13 + pmaddwd m7, m15 + pmaddwd m8, m15 + pmaddwd m9, m15 + punpcklqdq xm10, xm10 + pmovsxbw m10, xm10 + phaddd m0, m7 + phaddd m1, m8 + phaddd m2, m9 + paddd m0, m5 + paddd m1, m5 + paddd m2, m5 + psrad m0, xm6 + psrad m1, xm6 + psrad m2, xm6 + vperm2i128 m3, m0, m2, 0x21 ; 2 4 + vperm2i128 m2, m1, 0x13 ; 3 5 + pshufd m7, m10, q0000 + pshufd m8, m10, q1111 + pshufd m9, m10, q2222 + pshufd m10, m10, q3333 + packssdw m0, m3 ; 0 2 2 4 + packssdw m1, m2 ; 1 3 3 5 + punpckhwd m2, m0, m1 ; 23 45 + punpcklwd m0, m1 ; 01 23 +.dy2_w4_loop: + movu xm1, [srcq+ssq*0] + movu xm6, [srcq+r4 ] + movu xm3, [srcq+ssq*1] + movu xm11, [srcq+r6 ] + vinserti128 m1, [srcq+ssq*2], 1 ; 6 8 + vinserti128 m6, [srcq+r11 ], 1 + vinserti128 m3, [srcq+ss3q ], 1 ; 7 9 + vinserti128 m11, [srcq+r13 ], 1 + lea srcq, [srcq+ssq*4] + pmaddwd m4, m0, m7 + pmaddwd m5, m2, m8 + pshufb m1, m12 + pshufb m3, m12 + pmaddwd m1, m14 + pmaddwd m3, m14 + mova m0, [rsp+0x00] + pshufb m6, m13 + pshufb m11, m13 + pmaddwd m6, m15 + pmaddwd m11, m15 + paddd m4, m5 + movd xm5, [rsp+0x40] + phaddd m1, m6 + phaddd m3, m11 + paddd m1, m0 + paddd m3, m0 + psrad m1, xm5 + psrad m3, xm5 + pslld m3, 16 + pblendw m1, m3, 0xaa ; 67 89 + vperm2i128 m0, m2, m1, 0x21 ; 45 67 + paddd m4, [rsp+0x20] + mova m2, m1 + pmaddwd m5, m0, m9 + pmaddwd m6, m2, m10 + paddd m4, m5 + paddd m4, m6 +%if isput + psrad m4, [rsp+0x48] + vextracti128 xm5, m4, 1 + packusdw xm4, xm5 + pminsw xm4, [rsp+0x50] + movq [dstq+dsq*0], xm4 + movhps [dstq+dsq*1], xm4 + lea dstq, [dstq+dsq*2] +%else + psrad m4, 6 + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + mova [tmpq], xm4 + add tmpq, 16 +%endif + sub hd, 2 + jg .dy2_w4_loop + MC_8TAP_SCALED_RET + SWAP m10, m13 +.dy2_w8: + mov dword [rsp+0xa0], 1 + movifprep tmp_stridem, 16 + jmp .dy2_w_start +.dy2_w16: + mov dword [rsp+0xa0], 2 + movifprep tmp_stridem, 32 + jmp .dy2_w_start +.dy2_w32: + mov dword [rsp+0xa0], 4 + movifprep tmp_stridem, 64 + jmp .dy2_w_start +.dy2_w64: + mov dword [rsp+0xa0], 8 + movifprep tmp_stridem, 128 + jmp .dy2_w_start +.dy2_w128: + mov dword [rsp+0xa0], 16 + movifprep tmp_stridem, 256 +.dy2_w_start: + SWAP m10, m12, m1 + SWAP m11, m7 + ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free + mov myd, mym +%if isput + movifnidn dsm, dsq + mova [rsp+0xc0], xm7 +%endif + mova [rsp+0x00], m10 + mova [rsp+0x20], m13 + mova [rsp+0x40], xm11 + shr t0d, 16 + sub srcq, 6 + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pmaddwd m8, [base+rescale_mul2] + movd xm15, t0d + mov [rsp+0xa4], t0d + mov [rsp+0xa8], srcq + mov [rsp+0xb0], r0q ; dstq / tmpq +%if UNIX64 + mov hm, hd +%endif + shl dword dxm, 3 ; dx*8 + vpbroadcastd m15, xm15 + paddd m1, m8 ; mx+dx*[0-7] + movq xm0, r4q + pmovsxbw xm0, xm0 + mova [rsp+0x50], xm0 + jmp .dy2_hloop +.dy2_hloop_prep: + dec dword [rsp+0xa0] + jz 
.ret + add qword [rsp+0xb0], 16 + mov hd, hm + vpbroadcastd m8, dxm + vpbroadcastd m6, [base+pd_0x3ff] + paddd m1, m8, [rsp+0x60] + vpbroadcastd m15, [rsp+0xa4] + pxor m9, m9 + mov srcq, [rsp+0xa8] + mov r0q, [rsp+0xb0] ; dstq / tmpq + mova m10, [rsp+0x00] + mova xm11, [rsp+0x40] +.dy2_hloop: + vpbroadcastq xm2, [base+pq_0x40000000] + pand m5, m1, m6 + psrld m5, 6 + paddd m15, m5 + pcmpeqd m5, m9 + vextracti128 xm7, m15, 1 + movq r6, xm15 + pextrq r9, xm15, 1 + movq r11, xm7 + pextrq rX, xm7, 1 + mov r4d, r6d + shr r6, 32 + mov r7d, r9d + shr r9, 32 + mov r10d, r11d + shr r11, 32 + mov r13d, rXd + shr rX, 32 + mova [rsp+0x60], m1 + movq xm12, [base+subpel_filters+ r4*8] + movq xm13, [base+subpel_filters+ r6*8] + movhps xm12, [base+subpel_filters+ r7*8] + movhps xm13, [base+subpel_filters+ r9*8] + movq xm14, [base+subpel_filters+r10*8] + movq xm15, [base+subpel_filters+r11*8] + movhps xm14, [base+subpel_filters+r13*8] + movhps xm15, [base+subpel_filters+ rX*8] + psrld m1, 10 + vextracti128 xm7, m1, 1 + vextracti128 xm6, m5, 1 + movq r6, xm1 + pextrq r11, xm1, 1 + movq r9, xm7 + pextrq rX, xm7, 1 + mov r4d, r6d + shr r6, 32 + mov r10d, r11d + shr r11, 32 + mov r7d, r9d + shr r9, 32 + mov r13d, rXd + shr rX, 32 + pshufd xm4, xm5, q2200 + pshufd xm5, xm5, q3311 + pshufd xm7, xm6, q2200 + pshufd xm6, xm6, q3311 + pblendvb xm12, xm2, xm4 + pblendvb xm13, xm2, xm5 + pblendvb xm14, xm2, xm7 + pblendvb xm15, xm2, xm6 + pmovsxbw m12, xm12 + pmovsxbw m13, xm13 + pmovsxbw m14, xm14 + pmovsxbw m15, xm15 + MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + mova [rsp+0x80], m0 + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b + mova m0, [rsp+0x80] + vbroadcasti128 m7, [base+subpel_s_shuf8] + vpbroadcastd m8, [rsp+0x50] + vpbroadcastd m9, [rsp+0x54] + vpbroadcastd m10, [rsp+0x58] + vpbroadcastd m11, [rsp+0x5c] + pshufb m0, m7 ; 01a 01b + pshufb m1, m7 ; 23a 23b + pshufb m2, m7 ; 45a 45b + pshufb m3, m7 ; 67a 67b +.dy2_vloop: + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pmaddwd m6, m2, m10 + pmaddwd m7, m3, m11 + paddd m4, [rsp+0x20] + paddd m6, m7 + paddd m4, m5 + paddd m4, m6 +%if isput + psrad m4, [rsp+0x48] + vextracti128 xm5, m4, 1 + packusdw xm4, xm5 + pminsw xm4, [rsp+0xc0] + mova [dstq], xm4 + add dstq, dsm +%else + psrad m4, 6 + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 + mova [tmpq], xm4 + add tmpq, tmp_stridem +%endif + dec hd + jz .dy2_hloop_prep + mova m0, m1 + mova m1, m2 + mova m2, m3 + movu xm3, [srcq+ r4*2] + movu xm4, [srcq+ r6*2] + movu xm5, [srcq+ r7*2] + movu xm6, [srcq+ r9*2] + vinserti128 m3, [srcq+r10*2], 1 + vinserti128 m4, [srcq+r11*2], 1 + vinserti128 m5, [srcq+r13*2], 1 + vinserti128 m6, [srcq+ rX*2], 1 + add srcq, ssq + pmaddwd m3, m12 + pmaddwd m4, m13 + pmaddwd m5, m14 + pmaddwd m6, m15 + phaddd m3, m4 + phaddd m5, m6 + phaddd m3, m5 + movu xm4, [srcq+ r4*2] + movu xm5, [srcq+ r6*2] + movu xm6, [srcq+ r7*2] + movu xm7, [srcq+ r9*2] + vinserti128 m4, [srcq+r10*2], 1 + vinserti128 m5, [srcq+r11*2], 1 + vinserti128 m6, [srcq+r13*2], 1 + vinserti128 m7, [srcq+ rX*2], 1 + add srcq, ssq + pmaddwd m4, m12 + pmaddwd m5, m13 + pmaddwd m6, m14 + pmaddwd m7, m15 + phaddd m4, m5 + phaddd m6, m7 + mova m5, [rsp+0x00] + movd xm7, [rsp+0x40] + phaddd m4, m6 + paddd m3, m5 + paddd m4, m5 + psrad m3, xm7 + psrad m4, xm7 + pslld m4, 16 + pblendw m3, m4, 0xaa + jmp .dy2_vloop +.ret: + MC_8TAP_SCALED_RET 0 +%undef isput +%undef isprep +%endmacro + +%macro BILIN_SCALED_FN 1 
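+; Scaled bilinear is serviced by the shared 8-tap scaled kernels: the macro
+; only loads the filter-type selector into t0d/t1d and tail-jumps into the
+; matching *_8tap_scaled_16bpc entry point, e.g. (sketch, put variant):
+;   put_bilin_scaled_16bpc:  t0d = t1d = (5*15 << 16) | 5*15
+;                            jmp put_8tap_scaled_16bpc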
+cglobal %1_bilin_scaled_16bpc + mov t0d, (5*15 << 16) | 5*15 + mov t1d, t0d + jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX) +%endmacro + +%if WIN64 +DECLARE_REG_TMP 6, 5 +%else +DECLARE_REG_TMP 6, 8 +%endif + +%define PUT_8TAP_SCALED_FN FN put_8tap_scaled, +BILIN_SCALED_FN put +PUT_8TAP_SCALED_FN sharp, SHARP, SHARP +PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH +PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP +PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH +PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR +PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP +PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR +PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR +MC_8TAP_SCALED put + +%if WIN64 +DECLARE_REG_TMP 5, 4 +%else +DECLARE_REG_TMP 6, 7 +%endif + +%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled, +BILIN_SCALED_FN prep +PREP_8TAP_SCALED_FN sharp, SHARP, SHARP +PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH +PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP +PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH +PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR +PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP +PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR +PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH +PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR +MC_8TAP_SCALED prep + +%macro WARP_V 5 ; dst, 01, 23, 45, 67 + lea tmp1d, [myq+deltaq*4] + lea tmp2d, [myq+deltaq*1] + shr myd, 10 + shr tmp1d, 10 + movq xm8, [filterq+myq *8] + vinserti128 m8, [filterq+tmp1q*8], 1 ; a e + lea tmp1d, [tmp2q+deltaq*4] + lea myd, [tmp2q+deltaq*1] + shr tmp2d, 10 + shr tmp1d, 10 + movq xm0, [filterq+tmp2q*8] + vinserti128 m0, [filterq+tmp1q*8], 1 ; b f + lea tmp1d, [myq+deltaq*4] + lea tmp2d, [myq+deltaq*1] + shr myd, 10 + shr tmp1d, 10 + movq xm9, [filterq+myq *8] + vinserti128 m9, [filterq+tmp1q*8], 1 ; c g + lea tmp1d, [tmp2q+deltaq*4] + lea myd, [tmp2q+gammaq] ; my += gamma + punpcklwd m8, m0 + shr tmp2d, 10 + shr tmp1d, 10 + movq xm0, [filterq+tmp2q*8] + vinserti128 m0, [filterq+tmp1q*8], 1 ; d h + punpcklwd m0, m9, m0 + punpckldq m9, m8, m0 + punpckhdq m0, m8, m0 + punpcklbw m8, m11, m9 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8 + punpckhbw m9, m11, m9 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8 + pmaddwd m%2, m8 + pmaddwd m9, m%3 + punpcklbw m8, m11, m0 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8 + punpckhbw m0, m11, m0 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8 + pmaddwd m8, m%4 + pmaddwd m0, m%5 + paddd m9, m%2 + mova m%2, m%3 + paddd m0, m8 + mova m%3, m%4 + mova m%4, m%5 + paddd m%1, m0, m9 +%endmacro + +cglobal warp_affine_8x8t_16bpc, 4, 14, 16, tmp, ts + mov r6d, r7m + lea r9, [$$] + shr r6d, 11 + vpbroadcastd m13, [r9-$$+warp8x8_shift+r6*4] + vpbroadcastd m14, [warp8x8t_rnd] + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main + jmp .start +.loop: + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main2 + lea tmpq, [tmpq+tsq*4] +.start: + paddd m7, m14 + paddd m0, m14 + psrad m7, 15 + psrad m0, 15 + packssdw m7, m0 + vpermq m7, m7, q3120 + mova [tmpq+tsq*0], xm7 + vextracti128 [tmpq+tsq*2], m7, 1 + dec r4d + jg .loop +.end: + RET + +cglobal warp_affine_8x8_16bpc, 4, 14, 16, dst, ds, src, ss, abcd, mx, tmp2, \ + alpha, beta, filter, tmp1, delta, \ + my, gamma + mov r6d, r7m + lea filterq, [$$] + shr r6d, 11 + vpbroadcastd m13, [filterq-$$+warp8x8_shift+r6*4] + vpbroadcastd m14, [filterq-$$+warp8x8_rnd +r6*4] + vpbroadcastw m15, r7m ; pixel_max + call .main + jmp .start +.loop: + call .main2 + lea dstq, [dstq+dsq*2] +.start: + psrad m7, 16 + psrad m0, 16 + packusdw m7, 
m0 + pmulhrsw m7, m14 + pminsw m7, m15 + vpermq m7, m7, q3120 + mova [dstq+dsq*0], xm7 + vextracti128 [dstq+dsq*1], m7, 1 + dec r4d + jg .loop +.end: + RET +ALIGN function_align +.main: + ; Stack args offset by one (r4m -> r5m etc.) due to call +%if WIN64 + mov abcdq, r5m + mov mxd, r6m +%endif + movsx alphad, word [abcdq+2*0] + movsx betad, word [abcdq+2*1] + vpbroadcastd m12, [pd_32768] + pxor m11, m11 + add filterq, mc_warp_filter-$$ + lea tmp1q, [ssq*3] + add mxd, 512+(64<<10) + lea tmp2d, [alphaq*3] + sub srcq, tmp1q ; src -= src_stride*3 + sub betad, tmp2d ; beta -= alpha*3 + mov myd, r7m + call .h + psrld m1, m0, 16 + call .h + pblendw m1, m0, 0xaa ; 01 + psrld m2, m0, 16 + call .h + pblendw m2, m0, 0xaa ; 12 + psrld m3, m0, 16 + call .h + pblendw m3, m0, 0xaa ; 23 + psrld m4, m0, 16 + call .h + pblendw m4, m0, 0xaa ; 34 + psrld m5, m0, 16 + call .h + pblendw m5, m0, 0xaa ; 45 + psrld m6, m0, 16 + call .h + pblendw m6, m0, 0xaa ; 56 + movsx deltad, word [abcdq+2*2] + movsx gammad, word [abcdq+2*3] + add myd, 512+(64<<10) + mov r4d, 4 + lea tmp1d, [deltaq*3] + sub gammad, tmp1d ; gamma -= delta*3 +.main2: + call .h + psrld m7, m6, 16 + pblendw m7, m0, 0xaa ; 67 + WARP_V 7, 1, 3, 5, 7 + call .h + psrld m10, m5, 16 + pblendw m10, m0, 0xaa ; 78 + WARP_V 0, 2, 4, 6, 10 + ret +ALIGN function_align +.h: + lea tmp1d, [mxq+alphaq*4] + lea tmp2d, [mxq+alphaq*1] + movu xm10, [srcq-6] + vinserti128 m10, [srcq+2], 1 + shr mxd, 10 ; 0 + shr tmp1d, 10 ; 4 + movq xm0, [filterq+mxq *8] + vinserti128 m0, [filterq+tmp1q*8], 1 + lea tmp1d, [tmp2q+alphaq*4] + lea mxd, [tmp2q+alphaq*1] + movu xm8, [srcq-4] + vinserti128 m8, [srcq+4], 1 + shr tmp2d, 10 ; 1 + shr tmp1d, 10 ; 5 + movq xm9, [filterq+tmp2q*8] + vinserti128 m9, [filterq+tmp1q*8], 1 + lea tmp1d, [mxq+alphaq*4] + lea tmp2d, [mxq+alphaq*1] + shr mxd, 10 ; 2 + shr tmp1d, 10 ; 6 + punpcklbw m0, m11, m0 + pmaddwd m0, m10 + movu xm10, [srcq-2] + vinserti128 m10, [srcq+6], 1 + punpcklbw m9, m11, m9 + pmaddwd m9, m8 + movq xm8, [filterq+mxq *8] + vinserti128 m8, [filterq+tmp1q*8], 1 + lea tmp1d, [tmp2q+alphaq*4] + lea mxd, [tmp2q+betaq] ; mx += beta + phaddd m0, m9 ; 0 1 4 5 + movu xm9, [srcq+0] + vinserti128 m9, [srcq+8], 1 + shr tmp2d, 10 ; 3 + shr tmp1d, 10 ; 7 + punpcklbw m8, m11, m8 + pmaddwd m8, m10 + movq xm10, [filterq+tmp2q*8] + vinserti128 m10, [filterq+tmp1q*8], 1 + punpcklbw m10, m11, m10 + pmaddwd m9, m10 + add srcq, ssq + phaddd m8, m9 ; 2 3 6 7 + phaddd m0, m8 ; 0 1 2 3 4 5 6 7 + vpsllvd m0, m13 + paddd m0, m12 ; rounded 14-bit result in upper 16 bits of dword + ret + +%macro BIDIR_FN 0 + call .main + lea stride3q, [strideq*3] + jmp wq +.w4: + movq [dstq ], xm0 + movhps [dstq+strideq*1], xm0 + vextracti128 xm0, m0, 1 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + cmp hd, 4 + je .ret + lea dstq, [dstq+strideq*4] + movq [dstq ], xm1 + movhps [dstq+strideq*1], xm1 + vextracti128 xm1, m1, 1 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 + cmp hd, 8 + je .ret + lea dstq, [dstq+strideq*4] + movq [dstq ], xm2 + movhps [dstq+strideq*1], xm2 + vextracti128 xm2, m2, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + lea dstq, [dstq+strideq*4] + movq [dstq ], xm3 + movhps [dstq+strideq*1], xm3 + vextracti128 xm3, m3, 1 + movq [dstq+strideq*2], xm3 + movhps [dstq+stride3q ], xm3 +.ret: + RET +.w8: + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+stride3q ], m1, 1 + cmp hd, 4 + jne .w8_loop_start + RET +.w8_loop: + call .main + lea dstq, 
[dstq+strideq*4] + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+stride3q ], m1, 1 +.w8_loop_start: + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm2 + vextracti128 [dstq+strideq*1], m2, 1 + mova [dstq+strideq*2], xm3 + vextracti128 [dstq+stride3q ], m3, 1 + sub hd, 8 + jg .w8_loop + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*4] +.w16: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + call .main + lea dstq, [dstq+strideq*2] +.w32: + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*1+32*0], m2 + mova [dstq+strideq*1+32*1], m3 + sub hd, 2 + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + mova [dstq+32*2], m2 + mova [dstq+32*3], m3 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + mova [dstq+32*2], m2 + mova [dstq+32*3], m3 + call .main + mova [dstq+32*4], m0 + mova [dstq+32*5], m1 + mova [dstq+32*6], m2 + mova [dstq+32*7], m3 + dec hd + jg .w128_loop + RET +%endmacro + +%if WIN64 +DECLARE_REG_TMP 5 +%else +DECLARE_REG_TMP 7 +%endif + +cglobal avg_16bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 +%define base r6-avg_avx2_table + lea r6, [avg_avx2_table] + tzcnt wd, wm + mov t0d, r6m ; pixel_max + movsxd wq, [r6+wq*4] + shr t0d, 11 + vpbroadcastd m4, [base+bidir_rnd+t0*4] + vpbroadcastd m5, [base+bidir_mul+t0*4] + movifnidn hd, hm + add wq, r6 + BIDIR_FN +ALIGN function_align +.main: + mova m0, [tmp1q+32*0] + paddsw m0, [tmp2q+32*0] + mova m1, [tmp1q+32*1] + paddsw m1, [tmp2q+32*1] + mova m2, [tmp1q+32*2] + paddsw m2, [tmp2q+32*2] + mova m3, [tmp1q+32*3] + paddsw m3, [tmp2q+32*3] + add tmp1q, 32*4 + add tmp2q, 32*4 + pmaxsw m0, m4 + pmaxsw m1, m4 + pmaxsw m2, m4 + pmaxsw m3, m4 + psubsw m0, m4 + psubsw m1, m4 + psubsw m2, m4 + psubsw m3, m4 + pmulhw m0, m5 + pmulhw m1, m5 + pmulhw m2, m5 + pmulhw m3, m5 + ret + +cglobal w_avg_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, stride3 + lea r6, [w_avg_avx2_table] + tzcnt wd, wm + mov t0d, r6m ; weight + vpbroadcastw m8, r7m ; pixel_max + vpbroadcastd m7, [r6-w_avg_avx2_table+pd_65538] + movsxd wq, [r6+wq*4] + paddw m7, m8 + add wq, r6 + lea r6d, [t0-16] + shl t0d, 16 + sub t0d, r6d ; 16-weight, weight + pslld m7, 7 + rorx r6d, t0d, 30 ; << 2 + test dword r7m, 0x800 + cmovz r6d, t0d + movifnidn hd, hm + movd xm6, r6d + vpbroadcastd m6, xm6 + BIDIR_FN +ALIGN function_align +.main: + mova m4, [tmp1q+32*0] + mova m0, [tmp2q+32*0] + punpckhwd m5, m0, m4 + punpcklwd m0, m4 + mova m4, [tmp1q+32*1] + mova m1, [tmp2q+32*1] + pmaddwd m5, m6 + pmaddwd m0, m6 + paddd m5, m7 + paddd m0, m7 + psrad m5, 8 + psrad m0, 8 + packusdw m0, m5 + punpckhwd m5, m1, m4 + punpcklwd m1, m4 + mova m4, [tmp1q+32*2] + mova m2, [tmp2q+32*2] + pmaddwd m5, m6 + pmaddwd m1, m6 + paddd m5, m7 + paddd m1, m7 + psrad m5, 8 + psrad m1, 8 + packusdw m1, m5 + punpckhwd m5, m2, m4 + punpcklwd m2, m4 + mova m4, [tmp1q+32*3] + mova m3, [tmp2q+32*3] + add tmp1q, 32*4 + add tmp2q, 32*4 + pmaddwd m5, m6 + pmaddwd m2, m6 + paddd m5, m7 + paddd m2, m7 + psrad m5, 8 + psrad m2, 8 + packusdw m2, m5 + punpckhwd m5, m3, m4 + punpcklwd m3, m4 + pmaddwd m5, m6 + pmaddwd m3, m6 + paddd m5, m7 + paddd m3, m7 + psrad m5, 8 + psrad m3, 8 + packusdw m3, m5 + pminsw m0, m8 + pminsw m1, m8 + pminsw m2, m8 + pminsw m3, m8 
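+ ; m0-m3 now hold the pixel_max-clamped (m8) output rows consumed by the BIDIR_FN store code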
+ ret + +cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-mask_avx2_table + lea r7, [mask_avx2_table] + tzcnt wd, wm + mov r6d, r7m ; pixel_max + movifnidn hd, hm + shr r6d, 11 + movsxd wq, [r7+wq*4] + vpbroadcastd m8, [base+pw_64] + vpbroadcastd m9, [base+bidir_rnd+r6*4] + vpbroadcastd m10, [base+bidir_mul+r6*4] + mov maskq, maskmp + add wq, r7 + BIDIR_FN +ALIGN function_align +.main: +%macro MASK 1 + pmovzxbw m5, [maskq+16*%1] + mova m%1, [tmp1q+32*%1] + mova m6, [tmp2q+32*%1] + punpckhwd m4, m%1, m6 + punpcklwd m%1, m6 + psubw m7, m8, m5 + punpckhwd m6, m5, m7 ; m, 64-m + punpcklwd m5, m7 + pmaddwd m4, m6 ; tmp1 * m + tmp2 * (64-m) + pmaddwd m%1, m5 + psrad m4, 5 + psrad m%1, 5 + packssdw m%1, m4 + pmaxsw m%1, m9 + psubsw m%1, m9 + pmulhw m%1, m10 +%endmacro + MASK 0 + MASK 1 + MASK 2 + MASK 3 + add maskq, 16*4 + add tmp1q, 32*4 + add tmp2q, 32*4 + ret + +cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_420_avx2_table + lea r7, [w_mask_420_avx2_table] + tzcnt wd, wm + mov r6d, r8m ; pixel_max + movd xm0, r7m ; sign + movifnidn hd, hm + shr r6d, 11 + movsxd wq, [r7+wq*4] + vpbroadcastd m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 + vpbroadcastd m11, [base+pw_64] + vpbroadcastd m12, [base+bidir_rnd+r6*4] + vpbroadcastd m13, [base+bidir_mul+r6*4] + movd xm14, [base+pw_2] + mov maskq, maskmp + psubw xm14, xm0 + vpbroadcastw m14, xm14 + add wq, r7 + call .main + lea stride3q, [strideq*3] + jmp wq +.w4: + phaddd m4, m5 + paddw m4, m14 + psrlw m4, 2 + packuswb m4, m4 + vextracti128 xm5, m4, 1 + punpcklwd xm4, xm5 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti128 xm0, m0, 1 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + mova [maskq], xm4 + cmp hd, 8 + jl .w4_end + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + vextracti128 xm1, m1, 1 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 + je .w4_end + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti128 xm2, m2, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm3 + movhps [dstq+strideq*1], xm3 + vextracti128 xm3, m3, 1 + movq [dstq+strideq*2], xm3 + movhps [dstq+stride3q ], xm3 +.w4_end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*4] + add maskq, 16 +.w8: + vperm2i128 m6, m4, m5, 0x21 + vpblendd m4, m5, 0xf0 + paddw m4, m14 + paddw m4, m6 + psrlw m4, 2 + vextracti128 xm5, m4, 1 + packuswb xm4, xm5 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+stride3q ], m1, 1 + mova [maskq], xm4 + sub hd, 8 + jl .w8_end + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm2 + vextracti128 [dstq+strideq*1], m2, 1 + mova [dstq+strideq*2], xm3 + vextracti128 [dstq+stride3q ], m3, 1 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*4] + add maskq, 16 +.w16: + punpcklqdq m6, m4, m5 + punpckhqdq m4, m5 + paddw m6, m14 + paddw m4, m6 + psrlw m4, 2 + vextracti128 xm5, m4, 1 + packuswb xm4, xm5 + pshufd xm4, xm4, q3120 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + mova [maskq], xm4 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + call .main + lea dstq, [dstq+strideq*4] + add maskq, 32 +.w32: + paddw m4, m14 + paddw m4, m5 + psrlw m15, m4, 2 + mova [dstq+strideq*0+32*0], m0 + mova 
[dstq+strideq*0+32*1], m1 + mova [dstq+strideq*1+32*0], m2 + mova [dstq+strideq*1+32*1], m3 + call .main + mova m6, [deint_shuf] + paddw m4, m14 + paddw m4, m5 + psrlw m4, 2 + packuswb m15, m4 + vpermd m4, m6, m15 + mova [dstq+strideq*2+32*0], m0 + mova [dstq+strideq*2+32*1], m1 + mova [dstq+stride3q +32*0], m2 + mova [dstq+stride3q +32*1], m3 + mova [maskq], m4 + sub hd, 4 + jg .w32_loop + RET +.w64_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 32 +.w64: + paddw m4, m14 + paddw m15, m14, m5 + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*0+32*2], m2 + mova [dstq+strideq*0+32*3], m3 + mova [maskq], m4 ; no available registers + call .main + paddw m4, [maskq] + mova m6, [deint_shuf] + paddw m5, m15 + psrlw m4, 2 + psrlw m5, 2 + packuswb m4, m5 ; 0 2 4 6 1 3 5 7 + vpermd m4, m6, m4 + mova [dstq+strideq*1+32*0], m0 + mova [dstq+strideq*1+32*1], m1 + mova [dstq+strideq*1+32*2], m2 + mova [dstq+strideq*1+32*3], m3 + mova [maskq], m4 + sub hd, 2 + jg .w64_loop + RET +.w128_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 64 +.w128: + paddw m4, m14 + paddw m5, m14 + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*0+32*2], m2 + mova [dstq+strideq*0+32*3], m3 + mova [maskq+32*0], m4 + mova [dstq+strideq], m5 + call .main + paddw m4, m14 + paddw m15, m14, m5 + mova [dstq+strideq*0+32*4], m0 + mova [dstq+strideq*0+32*5], m1 + mova [dstq+strideq*0+32*6], m2 + mova [dstq+strideq*0+32*7], m3 + mova [maskq+32*1], m4 + call .main + paddw m4, [maskq+32*0] + paddw m5, [dstq+strideq] + mova m6, [deint_shuf] + psrlw m4, 2 + psrlw m5, 2 + packuswb m4, m5 + vpermd m4, m6, m4 + mova [dstq+strideq*1+32*0], m0 + mova [dstq+strideq*1+32*1], m1 + mova [dstq+strideq*1+32*2], m2 + mova [dstq+strideq*1+32*3], m3 + mova [maskq+32*0], m4 + call .main + paddw m4, [maskq+32*1] + mova m6, [deint_shuf] + paddw m5, m15 + psrlw m4, 2 + psrlw m5, 2 + packuswb m4, m5 + vpermd m4, m6, m4 + mova [dstq+strideq*1+32*4], m0 + mova [dstq+strideq*1+32*5], m1 + mova [dstq+strideq*1+32*6], m2 + mova [dstq+strideq*1+32*7], m3 + mova [maskq+32*1], m4 + sub hd, 2 + jg .w128_loop + RET +ALIGN function_align +.main: +%macro W_MASK 2-6 11, 12, 13 ; dst/src1, mask/src2, pw_64, rnd, mul + mova m%1, [tmp1q+32*%1] + mova m%2, [tmp2q+32*%1] + punpcklwd m8, m%2, m%1 + punpckhwd m9, m%2, m%1 + psubsw m%1, m%2 + pabsw m%1, m%1 + psubusw m7, m10, m%1 + psrlw m7, 10 ; 64-m + psubw m%2, m%3, m7 ; m + punpcklwd m%1, m7, m%2 + punpckhwd m7, m%2 + pmaddwd m%1, m8 + pmaddwd m7, m9 + psrad m%1, 5 + psrad m7, 5 + packssdw m%1, m7 + pmaxsw m%1, m%4 + psubsw m%1, m%4 + pmulhw m%1, m%5 +%endmacro + W_MASK 0, 4 + W_MASK 1, 5 + phaddw m4, m5 + W_MASK 2, 5 + W_MASK 3, 6 + phaddw m5, m6 + add tmp1q, 32*4 + add tmp2q, 32*4 + ret + +cglobal w_mask_422_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_422_avx2_table + lea r7, [w_mask_422_avx2_table] + tzcnt wd, wm + mov r6d, r8m ; pixel_max + vpbroadcastb m14, r7m ; sign + movifnidn hd, hm + shr r6d, 11 + movsxd wq, [r7+wq*4] + vpbroadcastd m10, [base+pw_27615] + vpbroadcastd m11, [base+pw_64] + vpbroadcastd m12, [base+bidir_rnd+r6*4] + vpbroadcastd m13, [base+bidir_mul+r6*4] + mova m15, [base+deint_shuf] + mov maskq, maskmp + add wq, r7 + call .main + lea stride3q, [strideq*3] + jmp wq +.w4: + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti128 xm0, m0, 1 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + cmp hd, 8 + jl .w4_end + lea dstq, 
[dstq+strideq*4] + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + vextracti128 xm1, m1, 1 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 + je .w4_end + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti128 xm2, m2, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm3 + movhps [dstq+strideq*1], xm3 + vextracti128 xm3, m3, 1 + movq [dstq+strideq*2], xm3 + movhps [dstq+stride3q ], xm3 +.w4_end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*4] +.w8: + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+stride3q ], m1, 1 + sub hd, 8 + jl .w8_end + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm2 + vextracti128 [dstq+strideq*1], m2, 1 + mova [dstq+strideq*2], xm3 + vextracti128 [dstq+stride3q ], m3, 1 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*4] +.w16: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m3 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + call .main + lea dstq, [dstq+strideq*2] +.w32: + mova [dstq+strideq*0+32*0], m0 + mova [dstq+strideq*0+32*1], m1 + mova [dstq+strideq*1+32*0], m2 + mova [dstq+strideq*1+32*1], m3 + sub hd, 2 + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + mova [dstq+32*2], m2 + mova [dstq+32*3], m3 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + mova [dstq+32*2], m2 + mova [dstq+32*3], m3 + call .main + mova [dstq+32*4], m0 + mova [dstq+32*5], m1 + mova [dstq+32*6], m2 + mova [dstq+32*7], m3 + dec hd + jg .w128_loop + RET +ALIGN function_align +.main: + W_MASK 0, 4 + W_MASK 1, 5 + phaddw m4, m5 + W_MASK 2, 5 + W_MASK 3, 6 + phaddw m5, m6 + add tmp1q, 32*4 + add tmp2q, 32*4 + packuswb m4, m5 + pxor m5, m5 + psubb m4, m14 + pavgb m4, m5 + vpermd m4, m15, m4 + mova [maskq], m4 + add maskq, 32 + ret + +cglobal w_mask_444_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_444_avx2_table + lea r7, [w_mask_444_avx2_table] + tzcnt wd, wm + mov r6d, r8m ; pixel_max + movifnidn hd, hm + shr r6d, 11 + movsxd wq, [r7+wq*4] + vpbroadcastd m10, [base+pw_27615] + vpbroadcastd m4, [base+pw_64] + vpbroadcastd m5, [base+bidir_rnd+r6*4] + vpbroadcastd m6, [base+bidir_mul+r6*4] + mov maskq, maskmp + add wq, r7 + call .main + lea stride3q, [strideq*3] + jmp wq +.w4: + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti128 xm0, m0, 1 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + cmp hd, 8 + jl .w4_end + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + vextracti128 xm1, m1, 1 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 + je .w4_end + call .main + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti128 xm0, m0, 1 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + vextracti128 xm1, m1, 1 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 +.w4_end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*4] +.w8: + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+stride3q ], m1, 1 + 
sub hd, 4 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*2] +.w16: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jg .w16_loop + RET +.w32_loop: + call .main + add dstq, strideq +.w32: + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + dec hd + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + call .main + mova [dstq+32*2], m0 + mova [dstq+32*3], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + call .main + mova [dstq+32*2], m0 + mova [dstq+32*3], m1 + call .main + mova [dstq+32*4], m0 + mova [dstq+32*5], m1 + call .main + mova [dstq+32*6], m0 + mova [dstq+32*7], m1 + dec hd + jg .w128_loop + RET +ALIGN function_align +.main: + W_MASK 0, 2, 4, 5, 6 + W_MASK 1, 3, 4, 5, 6 + packuswb m2, m3 + vpermq m2, m2, q3120 + add tmp1q, 32*2 + add tmp2q, 32*2 + mova [maskq], m2 + add maskq, 32 + ret + +; (a * (64 - m) + b * m + 32) >> 6 +; = (((b - a) * m + 32) >> 6) + a +; = (((b - a) * (m << 9) + 16384) >> 15) + a +; except m << 9 overflows int16_t when m == 64 (which is possible), +; but if we negate m it works out (-64 << 9 == -32768). +; = (((a - b) * (m * -512) + 16384) >> 15) + a +cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask +%define base r6-blend_avx2_table + lea r6, [blend_avx2_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r6+wq*4] + movifnidn maskq, maskmp + vpbroadcastd m6, [base+pw_m512] + add wq, r6 + lea r6, [dsq*3] + jmp wq +.w4: + pmovzxbw m3, [maskq] + movq xm0, [dstq+dsq*0] + movhps xm0, [dstq+dsq*1] + vpbroadcastq m1, [dstq+dsq*2] + vpbroadcastq m2, [dstq+r6 ] + vpblendd m0, m1, 0x30 + vpblendd m0, m2, 0xc0 + psubw m1, m0, [tmpq] + add maskq, 16 + add tmpq, 32 + pmullw m3, m6 + pmulhrsw m1, m3 + paddw m0, m1 + vextracti128 xm1, m0, 1 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + movq [dstq+dsq*2], xm1 + movhps [dstq+r6 ], xm1 + lea dstq, [dstq+dsq*4] + sub hd, 4 + jg .w4 + RET +.w8: + pmovzxbw m4, [maskq+16*0] + pmovzxbw m5, [maskq+16*1] + mova xm0, [dstq+dsq*0] + vinserti128 m0, [dstq+dsq*1], 1 + mova xm1, [dstq+dsq*2] + vinserti128 m1, [dstq+r6 ], 1 + psubw m2, m0, [tmpq+32*0] + psubw m3, m1, [tmpq+32*1] + add maskq, 16*2 + add tmpq, 32*2 + pmullw m4, m6 + pmullw m5, m6 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + mova [dstq+dsq*2], xm1 + vextracti128 [dstq+r6 ], m1, 1 + lea dstq, [dstq+dsq*4] + sub hd, 4 + jg .w8 + RET +.w16: + pmovzxbw m4, [maskq+16*0] + pmovzxbw m5, [maskq+16*1] + mova m0, [dstq+dsq*0] + psubw m2, m0, [tmpq+ 32*0] + mova m1, [dstq+dsq*1] + psubw m3, m1, [tmpq+ 32*1] + add maskq, 16*2 + add tmpq, 32*2 + pmullw m4, m6 + pmullw m5, m6 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w16 + RET +.w32: + pmovzxbw m4, [maskq+16*0] + pmovzxbw m5, [maskq+16*1] + mova m0, [dstq+32*0] + psubw m2, m0, [tmpq+32*0] + mova m1, [dstq+32*1] + psubw m3, m1, [tmpq+32*1] + add maskq, 16*2 + add tmpq, 32*2 + pmullw m4, m6 + pmullw m5, m6 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + add dstq, dsq + dec hd + jg .w32 + RET + +INIT_XMM avx2 +cglobal blend_v_16bpc, 3, 6, 6, dst, ds, tmp, w, h +%define base r5-blend_v_avx2_table + lea r5, [blend_v_avx2_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd 
wq, [r5+wq*4] + add wq, r5 + jmp wq +.w2: + vpbroadcastd m2, [base+obmc_masks_avx2+2*2] +.w2_loop: + movd m0, [dstq+dsq*0] + pinsrd m0, [dstq+dsq*1], 1 + movq m1, [tmpq] + add tmpq, 4*2 + psubw m1, m0, m1 + pmulhrsw m1, m2 + paddw m0, m1 + movd [dstq+dsq*0], m0 + pextrd [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w2_loop + RET +.w4: + vpbroadcastq m2, [base+obmc_masks_avx2+4*2] +.w4_loop: + movq m0, [dstq+dsq*0] + movhps m0, [dstq+dsq*1] + psubw m1, m0, [tmpq] + add tmpq, 8*2 + pmulhrsw m1, m2 + paddw m0, m1 + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w4_loop + RET +INIT_YMM avx2 +.w8: + vbroadcasti128 m2, [base+obmc_masks_avx2+8*2] +.w8_loop: + mova xm0, [dstq+dsq*0] + vinserti128 m0, [dstq+dsq*1], 1 + psubw m1, m0, [tmpq] + add tmpq, 16*2 + pmulhrsw m1, m2 + paddw m0, m1 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w8_loop + RET +.w16: + mova m4, [base+obmc_masks_avx2+16*2] +.w16_loop: + mova m0, [dstq+dsq*0] + psubw m2, m0, [tmpq+ 32*0] + mova m1, [dstq+dsq*1] + psubw m3, m1, [tmpq+ 32*1] + add tmpq, 32*2 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w16_loop + RET +.w32: +%if WIN64 + movaps [rsp+ 8], xmm6 + movaps [rsp+24], xmm7 +%endif + mova m6, [base+obmc_masks_avx2+32*2] + vbroadcasti128 m7, [base+obmc_masks_avx2+32*3] +.w32_loop: + mova m0, [dstq+dsq*0+32*0] + psubw m3, m0, [tmpq +32*0] + mova xm2, [dstq+dsq*0+32*1] + mova xm5, [tmpq +32*1] + mova m1, [dstq+dsq*1+32*0] + psubw m4, m1, [tmpq +32*2] + vinserti128 m2, [dstq+dsq*1+32*1], 1 + vinserti128 m5, [tmpq +32*3], 1 + add tmpq, 32*4 + psubw m5, m2, m5 + pmulhrsw m3, m6 + pmulhrsw m4, m6 + pmulhrsw m5, m7 + paddw m0, m3 + paddw m1, m4 + paddw m2, m5 + mova [dstq+dsq*0+32*0], m0 + mova [dstq+dsq*1+32*0], m1 + mova [dstq+dsq*0+32*1], xm2 + vextracti128 [dstq+dsq*1+32*1], m2, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w32_loop +%if WIN64 + movaps xmm6, [rsp+ 8] + movaps xmm7, [rsp+24] +%endif + RET + +%macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp + mova m0, [dstq+32*(%1+0)] + psubw m2, m0, [tmpq+32*(%2+0)] + mova m1, [dstq+32*(%1+1)] + psubw m3, m1, [tmpq+32*(%2+1)] +%if %3 + add tmpq, 32*%3 +%endif + pmulhrsw m2, m4 + pmulhrsw m3, m4 + paddw m0, m2 + paddw m1, m3 + mova [dstq+32*(%1+0)], m0 + mova [dstq+32*(%1+1)], m1 +%endmacro + +INIT_XMM avx2 +cglobal blend_h_16bpc, 3, 6, 6, dst, ds, tmp, w, h, mask +%define base r5-blend_h_avx2_table + lea r5, [blend_h_avx2_table] + tzcnt wd, wm + mov hd, hm + movsxd wq, [r5+wq*4] + add wq, r5 + lea maskq, [base+obmc_masks_avx2+hq*2] + lea hd, [hq*3] + shr hd, 2 ; h * 3/4 + lea maskq, [maskq+hq*2] + neg hq + jmp wq +.w2: + movd m0, [dstq+dsq*0] + pinsrd m0, [dstq+dsq*1], 1 + movd m2, [maskq+hq*2] + movq m1, [tmpq] + add tmpq, 4*2 + punpcklwd m2, m2 + psubw m1, m0, m1 + pmulhrsw m1, m2 + paddw m0, m1 + movd [dstq+dsq*0], m0 + pextrd [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w2 + RET +.w4: + mova m3, [blend_shuf] +.w4_loop: + movq m0, [dstq+dsq*0] + movhps m0, [dstq+dsq*1] + movd m2, [maskq+hq*2] + psubw m1, m0, [tmpq] + add tmpq, 8*2 + pshufb m2, m3 + pmulhrsw m1, m2 + paddw m0, m1 + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w4_loop + RET +INIT_YMM avx2 +.w8: + vbroadcasti128 m3, [blend_shuf] + shufpd m3, m3, 0x0c +.w8_loop: + mova xm0, [dstq+dsq*0] + vinserti128 m0, [dstq+dsq*1], 1 + 
vpbroadcastd m2, [maskq+hq*2] + psubw m1, m0, [tmpq] + add tmpq, 16*2 + pshufb m2, m3 + pmulhrsw m1, m2 + paddw m0, m1 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w8_loop + RET +.w16: + vpbroadcastw m4, [maskq+hq*2] + vpbroadcastw m5, [maskq+hq*2+2] + mova m0, [dstq+dsq*0] + psubw m2, m0, [tmpq+ 32*0] + mova m1, [dstq+dsq*1] + psubw m3, m1, [tmpq+ 32*1] + add tmpq, 32*2 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w16 + RET +.w32: + vpbroadcastw m4, [maskq+hq*2] + BLEND_H_ROW 0, 0, 2 + add dstq, dsq + inc hq + jl .w32 + RET +.w64: + vpbroadcastw m4, [maskq+hq*2] + BLEND_H_ROW 0, 0 + BLEND_H_ROW 2, 2, 4 + add dstq, dsq + inc hq + jl .w64 + RET +.w128: + vpbroadcastw m4, [maskq+hq*2] + BLEND_H_ROW 0, 0 + BLEND_H_ROW 2, 2, 8 + BLEND_H_ROW 4, -4 + BLEND_H_ROW 6, -2 + add dstq, dsq + inc hq + jl .w128 + RET + +cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \ + bottomext, rightext + ; we assume that the buffer (stride) is larger than width, so we can + ; safely overwrite by a few bytes + + ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + xor r12d, r12d + lea r10, [ihq-1] + cmp yq, ihq + cmovs r10, yq + test yq, yq + cmovs r10, r12 + imul r10, sstrideq + add srcq, r10 + + ; ref += iclip(x, 0, iw - 1) + lea r10, [iwq-1] + cmp xq, iwq + cmovs r10, xq + test xq, xq + cmovs r10, r12 + lea srcq, [srcq+r10*2] + + ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) + lea bottomextq, [yq+bhq] + sub bottomextq, ihq + lea r3, [bhq-1] + cmovs bottomextq, r12 + + DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \ + bottomext, rightext + + ; top_ext = iclip(-y, 0, bh - 1) + neg topextq + cmovs topextq, r12 + cmp bottomextq, bhq + cmovns bottomextq, r3 + cmp topextq, bhq + cmovg topextq, r3 + + ; right_ext = iclip(x + bw - iw, 0, bw - 1) + lea rightextq, [xq+bwq] + sub rightextq, iwq + lea r2, [bwq-1] + cmovs rightextq, r12 + + DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \ + bottomext, rightext + + ; left_ext = iclip(-x, 0, bw - 1) + neg leftextq + cmovs leftextq, r12 + cmp rightextq, bwq + cmovns rightextq, r2 + cmp leftextq, bwq + cmovns leftextq, r2 + + DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \ + dst, dstride, src, sstride, bottomext, rightext + + ; center_h = bh - top_ext - bottom_ext + lea r3, [bottomextq+topextq] + sub centerhq, r3 + + ; blk += top_ext * PXSTRIDE(dst_stride) + mov r2, topextq + imul r2, dstrideq + add dstq, r2 + mov r9m, dstq + + ; center_w = bw - left_ext - right_ext + mov centerwq, bwq + lea r3, [rightextq+leftextq] + sub centerwq, r3 + +%macro v_loop 3 ; need_left_ext, need_right_ext, suffix +.v_loop_%3: +%if %1 + ; left extension + xor r3, r3 + vpbroadcastw m0, [srcq] +.left_loop_%3: + mova [dstq+r3*2], m0 + add r3, 16 + cmp r3, leftextq + jl .left_loop_%3 + + ; body + lea r12, [dstq+leftextq*2] +%endif + xor r3, r3 +.body_loop_%3: + movu m0, [srcq+r3*2] +%if %1 + movu [r12+r3*2], m0 +%else + movu [dstq+r3*2], m0 +%endif + add r3, 16 + cmp r3, centerwq + jl .body_loop_%3 + +%if %2 + ; right extension +%if %1 + lea r12, [r12+centerwq*2] +%else + lea r12, [dstq+centerwq*2] +%endif + xor r3, r3 + vpbroadcastw m0, [srcq+centerwq*2-2] +.right_loop_%3: + movu [r12+r3*2], m0 + add r3, 16 + cmp r3, rightextq + jl .right_loop_%3 + +%endif + add dstq, dstrideq + add srcq, sstrideq + dec centerhq + jg .v_loop_%3 +%endmacro + + 
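+; dispatch to the v_loop variant matching which horizontal edge extensions are needed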
test leftextq, leftextq + jnz .need_left_ext + test rightextq, rightextq + jnz .need_right_ext + v_loop 0, 0, 0 + jmp .body_done + +.need_left_ext: + test rightextq, rightextq + jnz .need_left_right_ext + v_loop 1, 0, 1 + jmp .body_done + +.need_left_right_ext: + v_loop 1, 1, 2 + jmp .body_done + +.need_right_ext: + v_loop 0, 1, 3 + +.body_done: + ; bottom edge extension + test bottomextq, bottomextq + jz .top + mov srcq, dstq + sub srcq, dstrideq + xor r1, r1 +.bottom_x_loop: + mova m0, [srcq+r1*2] + lea r3, [dstq+r1*2] + mov r4, bottomextq +.bottom_y_loop: + mova [r3], m0 + add r3, dstrideq + dec r4 + jg .bottom_y_loop + add r1, 16 + cmp r1, bwq + jl .bottom_x_loop + +.top: + ; top edge extension + test topextq, topextq + jz .end + mov srcq, r9m + mov dstq, dstm + xor r1, r1 +.top_x_loop: + mova m0, [srcq+r1*2] + lea r3, [dstq+r1*2] + mov r4, topextq +.top_y_loop: + mova [r3], m0 + add r3, dstrideq + dec r4 + jg .top_y_loop + add r1, 16 + cmp r1, bwq + jl .top_x_loop + +.end: + RET + +cglobal resize_16bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \ + dst_w, h, src_w, dx, mx0, pxmax + sub dword mx0m, 4<<14 + sub dword src_wm, 8 + vpbroadcastd m5, dxm + vpbroadcastd m8, mx0m + vpbroadcastd m6, src_wm + DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax + LEA r7, $$ +%define base r7-$$ + vpbroadcastd m3, [base+pd_64] + vpbroadcastw xm7, pxmaxm + pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7] + pslld m5, 3 ; dx*8 + pslld m6, 14 + paddd m8, m2 ; mx+[0..7]*dx +.loop_y: + xor xd, xd + mova m4, m8 ; per-line working version of mx +.loop_x: + vpbroadcastd m10, [base+pd_63] + pxor m2, m2 + pmaxsd m0, m4, m2 + psrad m9, m4, 8 ; filter offset (unmasked) + pminsd m0, m6 ; iclip(mx, 0, src_w-8) + psubd m1, m4, m0 ; pshufb offset + psrad m0, 14 ; clipped src_x offset + psrad m1, 14 ; pshufb edge_emu offset + pand m9, m10 ; filter offset (masked) + ; load source pixels + movd r8d, xm0 + pextrd r9d, xm0, 1 + pextrd r10d, xm0, 2 + pextrd r11d, xm0, 3 + vextracti128 xm0, m0, 1 + movu xm10, [srcq+r8*2] + movu xm11, [srcq+r9*2] + movu xm12, [srcq+r10*2] + movu xm13, [srcq+r11*2] + movd r8d, xm0 + pextrd r9d, xm0, 1 + pextrd r10d, xm0, 2 + pextrd r11d, xm0, 3 + vinserti128 m10, [srcq+r8*2], 1 + vinserti128 m11, [srcq+r9*2], 1 + vinserti128 m12, [srcq+r10*2], 1 + vinserti128 m13, [srcq+r11*2], 1 + ptest m1, m1 + jz .filter + movq r9, xm1 + pextrq r11, xm1, 1 + movsxd r8, r9d + sar r9, 32 + movsxd r10, r11d + sar r11, 32 + vextracti128 xm1, m1, 1 + movu xm14, [base+resize_shuf+8+r8*2] + movu xm15, [base+resize_shuf+8+r9*2] + movu xm0, [base+resize_shuf+8+r10*2] + movu xm2, [base+resize_shuf+8+r11*2] + movq r9, xm1 + pextrq r11, xm1, 1 + movsxd r8, r9d + sar r9, 32 + movsxd r10, r11d + sar r11, 32 + vinserti128 m14, [base+resize_shuf+8+r8*2], 1 + vinserti128 m15, [base+resize_shuf+8+r9*2], 1 + vinserti128 m0, [base+resize_shuf+8+r10*2], 1 + vinserti128 m2, [base+resize_shuf+8+r11*2], 1 + pshufb m10, m14 + pshufb m11, m15 + pshufb m12, m0 + pshufb m13, m2 +.filter: + movd r8d, xm9 + pextrd r9d, xm9, 1 + pextrd r10d, xm9, 2 + pextrd r11d, xm9, 3 + vextracti128 xm9, m9, 1 + movq xm14, [base+resize_filter+r8*8] + movq xm15, [base+resize_filter+r9*8] + movq xm0, [base+resize_filter+r10*8] + movq xm2, [base+resize_filter+r11*8] + movd r8d, xm9 + pextrd r9d, xm9, 1 + pextrd r10d, xm9, 2 + pextrd r11d, xm9, 3 + movhps xm14, [base+resize_filter+r8*8] + movhps xm15, [base+resize_filter+r9*8] + movhps xm0, [base+resize_filter+r10*8] + movhps xm2, [base+resize_filter+r11*8] + pmovsxbw m14, 
xm14 + pmovsxbw m15, xm15 + pmovsxbw m0, xm0 + pmovsxbw m2, xm2 + pmaddwd m10, m14 + pmaddwd m11, m15 + pmaddwd m12, m0 + pmaddwd m13, m2 + phaddd m10, m11 + phaddd m12, m13 + phaddd m10, m12 + psubd m10, m3, m10 + psrad m10, 7 + vextracti128 xm0, m10, 1 + packusdw xm10, xm0 + pminsw xm10, xm7 + mova [dstq+xq*2], xm10 + paddd m4, m5 + add xd, 8 + cmp xd, dst_wd + jl .loop_x + add dstq, dst_strideq + add srcq, src_strideq + dec hd + jg .loop_y + RET + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/mc16_avx512.asm b/third_party/dav1d/src/x86/mc16_avx512.asm new file mode 100644 index 0000000000..585ba53e08 --- /dev/null +++ b/third_party/dav1d/src/x86/mc16_avx512.asm @@ -0,0 +1,4858 @@ +; Copyright © 2020, VideoLAN and dav1d authors +; Copyright © 2020, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
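+; 16bpc motion compensation, AVX-512 (Ice Lake client) implementation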
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 + db 32, 33, 34, 35, 34, 35, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41 +spel_h_shufC: db 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15, 16, 17 + db 40, 41, 42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49 + db 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23, 24, 25 + db 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57 +spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 + db 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45 +spel_h_shufD: db 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21 + db 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51, 50, 51, 52, 53 + db 20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29 + db 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59, 60, 61 +spel_v_shuf8: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 + db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39 + db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 + db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47 +spel_v_shuf16: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39 + db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47 + db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55 + db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63 +prep_endA: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 + db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62 + db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94 + db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126 +prep_endB: db 1, 2, 5, 6, 9, 10, 13, 14, 33, 34, 37, 38, 41, 42, 45, 46 + db 17, 18, 21, 22, 25, 26, 29, 30, 49, 50, 53, 54, 57, 58, 61, 62 + db 65, 66, 69, 70, 73, 74, 77, 78, 97, 98,101,102,105,106,109,110 + db 81, 82, 85, 86, 89, 90, 93, 94,113,114,117,118,121,122,125,126 +prep_endC: db 1, 2, 5, 6, 9, 10, 13, 14, 65, 66, 69, 70, 73, 74, 77, 78 + db 17, 18, 21, 22, 25, 26, 29, 30, 81, 82, 85, 86, 89, 90, 93, 94 + db 33, 34, 37, 38, 41, 42, 45, 46, 97, 98,101,102,105,106,109,110 + db 49, 50, 53, 54, 57, 58, 61, 62,113,114,117,118,121,122,125,126 +spel_shuf4a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30 + db 17, 18, 33, 34, 21, 22, 37, 38, 25, 26, 41, 42, 29, 30, 45, 46 +spel_shuf4b: db 18, 19, 33, 34, 22, 23, 37, 38, 26, 27, 41, 42, 30, 31, 45, 46 + db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62 +spel_shuf8a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30 + db 17, 18, 65, 66, 21, 22, 69, 70, 25, 26, 73, 74, 29, 30, 77, 78 + db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62 + db 49, 50, 97, 98, 53, 54,101,102, 57, 58,105,106, 61, 62,109,110 +spel_shuf8b: db 18, 19, 65, 66, 22, 23, 69, 70, 26, 27, 73, 74, 30, 31, 77, 78 + db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94 + db 50, 51, 97, 98, 54, 55,101,102, 58, 59,105,106, 62, 63,109,110 + db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126 +spel_shuf16: db 1, 2, 33, 34, 5, 6, 37, 38, 9, 10, 41, 42, 13, 14, 45, 46 + db 17, 18, 49, 50, 21, 22, 53, 54, 25, 26, 57, 58, 29, 30, 61, 62 + db 65, 66, 97, 98, 69, 70,101,102, 73, 74,105,106, 77, 78,109,110 + db 81, 82,113,114, 85, 86,117,118, 89, 90,121,122, 93, 94,125,126 +spel_shuf32: db 1, 2, 65, 66, 5, 6, 69, 70, 9, 10, 73, 74, 13, 14, 77, 78 + db 17, 18, 81, 82, 21, 
22, 85, 86, 25, 26, 89, 90, 29, 30, 93, 94 + db 33, 34, 97, 98, 37, 38,101,102, 41, 42,105,106, 45, 46,109,110 + db 49, 50,113,114, 53, 54,117,118, 57, 58,121,122, 61, 62,125,126 +spel_h_shuf2b: db 1, 2, 17, 18, 5, 6, 21, 22, 17, 18, 33, 34, 21, 22, 37, 38 + db 33, 34, 49, 50, 37, 38, 53, 54, 49, 50, 9, 10, 53, 54, 13, 14 + db 9, 10, 25, 26, 13, 14, 29, 30, 25, 26, 41, 42, 29, 30, 45, 46 +spel_shuf2: db 10, 11, 17, 18, 14, 15, 21, 22, 17, 18, 25, 26, 21, 22, 29, 30 +spel_h_shuf2a: db 0, 1, 2, 3, 2, 3, 4, 5, 16, 17, 18, 19, 18, 19, 20, 21 + db 4, 5, 6, 7, 6, 7, 8, 9, 20, 21, 22, 23, 22, 23, 24, 25 +w_mask_end42x: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 + db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125 +w_mask_end444: db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62 + db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94 + db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126 +w_mask_shuf4: db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30 + db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62 + db 64, 66, 72, 74, 68, 70, 76, 78, 80, 82, 88, 90, 84, 86, 92, 94 + db 96, 98,104,106,100,102,108,110,112,114,120,122,116,118,124,126 +w_mask_shuf8: db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30 + db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62 + db 64, 66, 80, 82, 68, 70, 84, 86, 72, 74, 88, 90, 76, 78, 92, 94 + db 96, 98,112,114,100,102,116,118,104,106,120,122,108,110,124,126 +w_mask_shuf16: db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46 + db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62 + db 64, 66, 96, 98, 68, 70,100,102, 72, 74,104,106, 76, 78,108,110 + db 80, 82,112,114, 84, 86,116,118, 88, 90,120,122, 92, 94,124,126 +warp8x8_permA: db 0, 1, 2, 3, 32, 33, 34, 35, 2, 3, 4, 5, 34, 35, 36, 37 + db 4, 5, 6, 7, 36, 37, 38, 39, 6, 7, 8, 9, 38, 39, 40, 41 + db 8, 9, 10, 11, 40, 41, 42, 43, 10, 11, 12, 13, 42, 43, 44, 45 + db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49 +warp8x8_permB: db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49 + db 16, 17, 18, 19, 48, 49, 50, 51, 18, 19, 20, 21, 50, 51, 52, 53 + db 20, 21, 22, 23, 52, 53, 54, 55, 22, 23, 24, 25, 54, 55, 56, 57 + db 24, 25, 26, 27, 56, 57, 58, 59, 26, 27, 28, 29, 58, 59, 60, 61 +warp8x8_end: db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53 + db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55 + db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61 + db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63 +deint_q_shuf: ;dq 0, 2, 4, 6, 1, 3, 5, 7 +pd_0to7: dd 0, 1, 2, 3, 4, 5, 6, 7 + dd 1 +pw_2048: times 2 dw 2048 + dd 3 +pw_8192: times 2 dw 8192 +avg_shift: dw 5, 5, 3, 3 +pw_27615: times 2 dw 27615 +pw_32766: times 2 dw 32766 +warp8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13 +warp8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15 +warp_shift_h: db 11, 19, 11, 19, 43, 51, 43, 51, 13, 21, 13, 21, 45, 53, 45, 53 +blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 +resize_permA: dd 0, 4, 8, 12, 1, 5, 9, 13, 16, 20, 24, 28, 17, 21, 25, 29 +resize_permB: dd 2, 6, 10, 14, 3, 7, 11, 15, 18, 22, 26, 30, 19, 23, 27, 31 +resize_permC: dq 0, 1, 4, 5, 8, 9, 12, 13 +resize_permD: dq 2, 3, 6, 7, 10, 11, 14, 15 +resize_permE: dq 0, 2, 4, 6 +resize_shufA: db -1, 0, -1, 1, -1, 4, -1, 5, -1, 8, -1, 9, 
-1, 12, -1, 13 +resize_shufB: db -1, 2, -1, 3, -1, 6, -1, 7, -1, 10, -1, 11, -1, 14, -1, 15 +rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 + db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15 + +prep_hv_shift: dq 6, 4 +put_bilin_h_rnd: dw 8, 8, 10, 10 +prep_mul: dw 16, 16, 4, 4 +put_8tap_h_rnd: dd 34, 40 +prep_8tap_rnd: dd 128 - (8192 << 8) +warp_8x8_rnd_h: dd 512, 2048 +warp_8x8_rnd_v: dd 262144, 65536 +warp_8x8t_rnd_v: dd 16384 - (8192 << 15) +avg_round: dw -16400, -16400, -16388, -16388 +w_avg_round: dd 128 + (8192 << 4), 32 + (8192 << 4) +mask_round: dd 512 + (8192 << 6), 128 + (8192 << 6) +w_mask_round: dd 128, 64 +bidir_shift: dw 6, 6, 4, 4 + +pb_64: times 4 db 64 +pw_m512: times 2 dw -512 +pw_2: times 2 dw 2 +pw_64: times 2 dw 64 +pd_32: dd 32 +pd_63: dd 63 +pd_128: dd 128 +pd_640: dd 640 +pd_2176: dd 2176 +pd_16384: dd 16384 +pd_0_4: dd 0, 4 + +%define pw_16 prep_mul +%define pd_512 warp_8x8_rnd_h + +%macro BASE_JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - %3) + %xdefine %%base %1_%2 + %%table: + %rep %0 - 2 + dw %%base %+ _w%3 - %%base + %rotate 1 + %endrep +%endmacro + +%macro HV_JMP_TABLE 5-* + %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3) + %xdefine %%base %1_%3 + %assign %%types %4 + %if %%types & 1 + %xdefine %1_%2_h_%3_table (%%h - %5) + %%h: + %rep %0 - 4 + dw %%prefix %+ .h_w%5 - %%base + %rotate 1 + %endrep + %rotate 4 + %endif + %if %%types & 2 + %xdefine %1_%2_v_%3_table (%%v - %5) + %%v: + %rep %0 - 4 + dw %%prefix %+ .v_w%5 - %%base + %rotate 1 + %endrep + %rotate 4 + %endif + %if %%types & 4 + %xdefine %1_%2_hv_%3_table (%%hv - %5) + %%hv: + %rep %0 - 4 + dw %%prefix %+ .hv_w%5 - %%base + %rotate 1 + %endrep + %endif +%endmacro + +%macro BIDIR_JMP_TABLE 2-* + %xdefine %1_%2_table (%%table - 2*%3) + %xdefine %%base %1_%2_table + %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2) + %%table: + %rep %0 - 2 + dd %%prefix %+ .w%3 - %%base + %rotate 1 + %endrep +%endmacro + +%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_16bpc_avx512icl.put) +%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_16bpc_avx512icl.prep) + +BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE blend, avx512icl, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_v, avx512icl, 2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_h, avx512icl, 2, 4, 8, 16, 32, 64, 128 +BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128 +BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, 8tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, 8tap, avx512icl, 2, 4, 8, 16, 32, 64, 128 + +%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX + +cextern mc_subpel_filters +%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) + +cextern mc_warp_filter +cextern obmc_masks_avx2 +cextern resize_filter + +SECTION .text + +%if WIN64 +DECLARE_REG_TMP 4 +%else +DECLARE_REG_TMP 8 +%endif + +INIT_ZMM avx512icl +cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w, h, mxy + mov mxyd, r6m ; mx + lea r7, 
[put_avx512icl] + tzcnt t0d, wm + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r7m ; my + test mxyd, mxyd + jnz .v +.put: + movzx t0d, word [r7+t0*2+table_offset(put,)] + add t0, r7 + jmp t0 +.put_w2: + mov r6d, [srcq+ssq*0] + mov r7d, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r6d + mov [dstq+dsq*1], r7d + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w2 + RET +.put_w4: + mov r6, [srcq+ssq*0] + mov r7, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r6 + mov [dstq+dsq*1], r7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w4 + RET +.put_w8: + movu xmm0, [srcq+ssq*0] + movu xmm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], xmm0 + mova [dstq+dsq*1], xmm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w8 + RET +.put_w16: + movu ym0, [srcq+ssq*0] + movu ym1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], ym0 + mova [dstq+dsq*1], ym1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w16 + RET +.put_w32: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w32 + RET +.put_w64: + movu m0, [srcq+ssq*0+64*0] + movu m1, [srcq+ssq*0+64*1] + movu m2, [srcq+ssq*1+64*0] + movu m3, [srcq+ssq*1+64*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0+64*0], m0 + mova [dstq+dsq*0+64*1], m1 + mova [dstq+dsq*1+64*0], m2 + mova [dstq+dsq*1+64*1], m3 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w64 + RET +.put_w128: + movu m0, [srcq+64*0] + movu m1, [srcq+64*1] + movu m2, [srcq+64*2] + movu m3, [srcq+64*3] + add srcq, ssq + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + mova [dstq+64*2], m2 + mova [dstq+64*3], m3 + add dstq, dsq + dec hd + jg .put_w128 + RET +.h: + vpbroadcastw m5, mxyd + mov mxyd, r7m ; my + vpbroadcastd m4, [pw_16] + psubw m4, m5 + test mxyd, mxyd + jnz .hv + ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v + movzx t0d, word [r7+t0*2+table_offset(put, _bilin_h)] + mov r6d, r8m ; bitdepth_max + add t0, r7 + shr r6d, 11 + vpbroadcastd m6, [r7-put_avx512icl+put_bilin_h_rnd+r6*4] + jmp t0 +.h_w2: + movq xmm1, [srcq+ssq*0] + movhps xmm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmullw xmm0, xmm1, xm4 + psrlq xmm1, 16 + pmullw xmm1, xm5 + paddw xmm0, xm6 + paddw xmm0, xmm1 + psrlw xmm0, 4 + movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2 + RET +.h_w4: + movq xmm0, [srcq+ssq*0+0] + movhps xmm0, [srcq+ssq*1+0] + movq xmm1, [srcq+ssq*0+2] + movhps xmm1, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + pmullw xmm0, xm4 + pmullw xmm1, xm5 + paddw xmm0, xm6 + paddw xmm0, xmm1 + psrlw xmm0, 4 + movq [dstq+dsq*0], xmm0 + movhps [dstq+dsq*1], xmm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4 + RET +.h_w8: + movu xm0, [srcq+ssq*0+0] + vinserti32x4 ym0, [srcq+ssq*1+0], 1 + movu xm1, [srcq+ssq*0+2] + vinserti32x4 ym1, [srcq+ssq*1+2], 1 + lea srcq, [srcq+ssq*2] + pmullw ym0, ym4 + pmullw ym1, ym5 + paddw ym0, ym6 + paddw ym0, ym1 + psrlw ym0, 4 + mova [dstq+dsq*0], xm0 + vextracti32x4 [dstq+dsq*1], ym0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + movu ym0, [srcq+ssq*0+0] + vinserti32x8 m0, [srcq+ssq*1+0], 1 + movu ym1, [srcq+ssq*0+2] + vinserti32x8 m1, [srcq+ssq*1+2], 1 + lea srcq, [srcq+ssq*2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m6 + paddw m0, m1 + psrlw m0, 4 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w16 + RET +.h_w32: + pmullw m0, m4, [srcq+ssq*0+0] + pmullw m2, m5, 
[srcq+ssq*0+2] + pmullw m1, m4, [srcq+ssq*1+0] + pmullw m3, m5, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + paddw m0, m6 + paddw m1, m6 + paddw m0, m2 + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w32 + RET +.h_w64: + pmullw m0, m4, [srcq+64*0+0] + pmullw m2, m5, [srcq+64*0+2] + pmullw m1, m4, [srcq+64*1+0] + pmullw m3, m5, [srcq+64*1+2] + add srcq, ssq + paddw m0, m6 + paddw m1, m6 + paddw m0, m2 + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + add dstq, dsq + dec hd + jg .h_w64 + RET +.h_w128: + pmullw m0, m4, [srcq+64*0+0] + pmullw m7, m5, [srcq+64*0+2] + pmullw m1, m4, [srcq+64*1+0] + pmullw m8, m5, [srcq+64*1+2] + pmullw m2, m4, [srcq+64*2+0] + pmullw m9, m5, [srcq+64*2+2] + pmullw m3, m4, [srcq+64*3+0] + pmullw m10, m5, [srcq+64*3+2] + add srcq, ssq + REPX {paddw x, m6}, m0, m1, m2, m3 + paddw m0, m7 + paddw m1, m8 + paddw m2, m9 + paddw m3, m10 + REPX {psrlw x, 4}, m0, m1, m2, m3 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + mova [dstq+64*2], m2 + mova [dstq+64*3], m3 + add dstq, dsq + dec hd + jg .h_w128 + RET +.v: + movzx t0d, word [r7+t0*2+table_offset(put, _bilin_v)] + shl mxyd, 11 + vpbroadcastw m8, mxyd + add t0, r7 + jmp t0 +.v_w2: + movd xmm0, [srcq+ssq*0] +.v_w2_loop: + movd xmm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpckldq xmm2, xmm0, xmm1 + movd xmm0, [srcq+ssq*0] + punpckldq xmm1, xmm0 + psubw xmm1, xmm2 + pmulhrsw xmm1, xm8 + paddw xmm1, xmm2 + movd [dstq+dsq*0], xmm1 + pextrd [dstq+dsq*1], xmm1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movq xmm0, [srcq+ssq*0] +.v_w4_loop: + movq xmm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklqdq xmm2, xmm0, xmm1 + movq xmm0, [srcq+ssq*0] + punpcklqdq xmm1, xmm0 + psubw xmm1, xmm2 + pmulhrsw xmm1, xm8 + paddw xmm1, xmm2 + movq [dstq+dsq*0], xmm1 + movhps [dstq+dsq*1], xmm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + movu xmm0, [srcq+ssq*0] +.v_w8_loop: + vbroadcasti128 ymm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd ymm2, ymm0, ymm1, 0xf0 + vbroadcasti128 ymm0, [srcq+ssq*0] + vpblendd ymm1, ymm0, 0xf0 + psubw ymm1, ymm2 + pmulhrsw ymm1, ym8 + paddw ymm1, ymm2 + mova [dstq+dsq*0], xmm1 + vextracti128 [dstq+dsq*1], ymm1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + vzeroupper + RET +.v_w16: + movu ym0, [srcq+ssq*0] +.v_w16_loop: + movu ym3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + psubw ym1, ym3, ym0 + pmulhrsw ym1, ym8 + paddw ym1, ym0 + movu ym0, [srcq+ssq*0] + psubw ym2, ym0, ym3 + pmulhrsw ym2, ym8 + paddw ym2, ym3 + mova [dstq+dsq*0], ym1 + mova [dstq+dsq*1], ym2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w16_loop + RET +.v_w32: + movu m0, [srcq+ssq*0] +.v_w32_loop: + movu m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + psubw m1, m3, m0 + pmulhrsw m1, m8 + paddw m1, m0 + movu m0, [srcq+ssq*0] + psubw m2, m0, m3 + pmulhrsw m2, m8 + paddw m2, m3 + mova [dstq+dsq*0], m1 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w32_loop + RET +.v_w64: + movu m0, [srcq+ssq*0+64*0] + movu m1, [srcq+ssq*0+64*1] +.v_w64_loop: + movu m2, [srcq+ssq*1+64*0] + movu m3, [srcq+ssq*1+64*1] + lea srcq, [srcq+ssq*2] + psubw m4, m2, m0 + pmulhrsw m4, m8 + paddw m4, m0 + movu m0, [srcq+ssq*0+64*0] + psubw m5, m3, m1 + pmulhrsw m5, m8 + paddw m5, m1 + movu m1, [srcq+ssq*0+64*1] + psubw m6, m0, m2 + pmulhrsw m6, m8 + psubw m7, m1, m3 + pmulhrsw m7, m8 + mova [dstq+dsq*0+64*0], m4 + mova [dstq+dsq*0+64*1], m5 + paddw m6, m2 + paddw 
m7, m3 + mova [dstq+dsq*1+64*0], m6 + mova [dstq+dsq*1+64*1], m7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w64_loop + RET +.v_w128: + movu m0, [srcq+ssq*0+64*0] + movu m1, [srcq+ssq*0+64*1] + movu m2, [srcq+ssq*0+64*2] + movu m3, [srcq+ssq*0+64*3] +.v_w128_loop: + movu m4, [srcq+ssq*1+64*0] + movu m5, [srcq+ssq*1+64*1] + movu m6, [srcq+ssq*1+64*2] + movu m7, [srcq+ssq*1+64*3] + lea srcq, [srcq+ssq*2] + psubw m9, m4, m0 + pmulhrsw m9, m8 + paddw m9, m0 + movu m0, [srcq+ssq*0+64*0] + psubw m10, m5, m1 + pmulhrsw m10, m8 + paddw m10, m1 + movu m1, [srcq+ssq*0+64*1] + psubw m11, m6, m2 + pmulhrsw m11, m8 + paddw m11, m2 + movu m2, [srcq+ssq*0+64*2] + psubw m12, m7, m3 + pmulhrsw m12, m8 + paddw m12, m3 + movu m3, [srcq+ssq*0+64*3] + mova [dstq+dsq*0+64*0], m9 + psubw m9, m0, m4 + pmulhrsw m9, m8 + mova [dstq+dsq*0+64*1], m10 + psubw m10, m1, m5 + pmulhrsw m10, m8 + mova [dstq+dsq*0+64*2], m11 + psubw m11, m2, m6 + pmulhrsw m11, m8 + mova [dstq+dsq*0+64*3], m12 + psubw m12, m3, m7 + pmulhrsw m12, m8 + paddw m9, m4 + paddw m10, m5 + mova [dstq+dsq*1+64*0], m9 + mova [dstq+dsq*1+64*1], m10 + paddw m11, m6 + paddw m12, m7 + mova [dstq+dsq*1+64*2], m11 + mova [dstq+dsq*1+64*3], m12 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w128_loop + RET +.hv: + movzx t0d, word [r7+t0*2+table_offset(put, _bilin_hv)] + shl mxyd, 11 + vpbroadcastd m6, [pw_2] + vpbroadcastw m7, mxyd + vpbroadcastd m8, [pw_8192] + add t0, r7 + test dword r8m, 0x800 + jnz .hv_12bpc + psllw m4, 2 + psllw m5, 2 + vpbroadcastd m8, [pw_2048] +.hv_12bpc: + jmp t0 +.hv_w2: + vpbroadcastq xmm1, [srcq+ssq*0] + pmullw xmm0, xmm1, xm4 + psrlq xmm1, 16 + pmullw xmm1, xm5 + paddw xmm0, xm6 + paddw xmm0, xmm1 + psrlw xmm0, 2 +.hv_w2_loop: + movq xmm2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps xmm2, [srcq+ssq*0] + pmullw xmm1, xmm2, xm4 + psrlq xmm2, 16 + pmullw xmm2, xm5 + paddw xmm1, xm6 + paddw xmm1, xmm2 + psrlw xmm1, 2 ; 1 _ 2 _ + shufpd xmm2, xmm0, xmm1, 0x01 ; 0 _ 1 _ + mova xmm0, xmm1 + psubw xmm1, xmm2 + paddw xmm1, xmm1 + pmulhw xmm1, xm7 + paddw xmm1, xmm2 + pmulhrsw xmm1, xm8 + movd [dstq+dsq*0], xmm1 + pextrd [dstq+dsq*1], xmm1, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + pmullw xmm0, xm4, [srcq+ssq*0-8] + pmullw xmm1, xm5, [srcq+ssq*0-6] + paddw xmm0, xm6 + paddw xmm0, xmm1 + psrlw xmm0, 2 +.hv_w4_loop: + movq xmm1, [srcq+ssq*1+0] + movq xmm2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + movhps xmm1, [srcq+ssq*0+0] + movhps xmm2, [srcq+ssq*0+2] + pmullw xmm1, xm4 + pmullw xmm2, xm5 + paddw xmm1, xm6 + paddw xmm1, xmm2 + psrlw xmm1, 2 ; 1 2 + shufpd xmm2, xmm0, xmm1, 0x01 ; 0 1 + mova xmm0, xmm1 + psubw xmm1, xmm2 + paddw xmm1, xmm1 + pmulhw xmm1, xm7 + paddw xmm1, xmm2 + pmulhrsw xmm1, xm8 + movq [dstq+dsq*0], xmm1 + movhps [dstq+dsq*1], xmm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + pmullw xmm0, xm4, [srcq+ssq*0+0] + pmullw xmm1, xm5, [srcq+ssq*0+2] + paddw xmm0, xm6 + paddw xmm0, xmm1 + psrlw xmm0, 2 + vinserti32x4 ym0, xmm0, 1 +.hv_w8_loop: + movu xm1, [srcq+ssq*1+0] + movu xm2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + vinserti32x4 ym1, [srcq+ssq*0+0], 1 + vinserti32x4 ym2, [srcq+ssq*0+2], 1 + pmullw ym1, ym4 + pmullw ym2, ym5 + paddw ym1, ym6 + paddw ym1, ym2 + psrlw ym1, 2 ; 1 2 + vshufi32x4 ym2, ym0, ym1, 0x01 ; 0 1 + mova ym0, ym1 + psubw ym1, ym2 + paddw ym1, ym1 + pmulhw ym1, ym7 + paddw ym1, ym2 + pmulhrsw ym1, ym8 + mova [dstq+dsq*0], xm1 + vextracti32x4 [dstq+dsq*1], ym1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w16: + pmullw ym0, 
+.hv_w2:
+ vpbroadcastq xmm1, [srcq+ssq*0]
+ pmullw xmm0, xmm1, xm4
+ psrlq xmm1, 16
+ pmullw xmm1, xm5
+ paddw xmm0, xm6
+ paddw xmm0, xmm1
+ psrlw xmm0, 2
+.hv_w2_loop:
+ movq xmm2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xmm2, [srcq+ssq*0]
+ pmullw xmm1, xmm2, xm4
+ psrlq xmm2, 16
+ pmullw xmm2, xm5
+ paddw xmm1, xm6
+ paddw xmm1, xmm2
+ psrlw xmm1, 2 ; 1 _ 2 _
+ shufpd xmm2, xmm0, xmm1, 0x01 ; 0 _ 1 _
+ mova xmm0, xmm1
+ psubw xmm1, xmm2
+ paddw xmm1, xmm1
+ pmulhw xmm1, xm7
+ paddw xmm1, xmm2
+ pmulhrsw xmm1, xm8
+ movd [dstq+dsq*0], xmm1
+ pextrd [dstq+dsq*1], xmm1, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ pmullw xmm0, xm4, [srcq+ssq*0-8]
+ pmullw xmm1, xm5, [srcq+ssq*0-6]
+ paddw xmm0, xm6
+ paddw xmm0, xmm1
+ psrlw xmm0, 2
+.hv_w4_loop:
+ movq xmm1, [srcq+ssq*1+0]
+ movq xmm2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ movhps xmm1, [srcq+ssq*0+0]
+ movhps xmm2, [srcq+ssq*0+2]
+ pmullw xmm1, xm4
+ pmullw xmm2, xm5
+ paddw xmm1, xm6
+ paddw xmm1, xmm2
+ psrlw xmm1, 2 ; 1 2
+ shufpd xmm2, xmm0, xmm1, 0x01 ; 0 1
+ mova xmm0, xmm1
+ psubw xmm1, xmm2
+ paddw xmm1, xmm1
+ pmulhw xmm1, xm7
+ paddw xmm1, xmm2
+ pmulhrsw xmm1, xm8
+ movq [dstq+dsq*0], xmm1
+ movhps [dstq+dsq*1], xmm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ pmullw xmm0, xm4, [srcq+ssq*0+0]
+ pmullw xmm1, xm5, [srcq+ssq*0+2]
+ paddw xmm0, xm6
+ paddw xmm0, xmm1
+ psrlw xmm0, 2
+ vinserti32x4 ym0, xmm0, 1
+.hv_w8_loop:
+ movu xm1, [srcq+ssq*1+0]
+ movu xm2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x4 ym1, [srcq+ssq*0+0], 1
+ vinserti32x4 ym2, [srcq+ssq*0+2], 1
+ pmullw ym1, ym4
+ pmullw ym2, ym5
+ paddw ym1, ym6
+ paddw ym1, ym2
+ psrlw ym1, 2 ; 1 2
+ vshufi32x4 ym2, ym0, ym1, 0x01 ; 0 1
+ mova ym0, ym1
+ psubw ym1, ym2
+ paddw ym1, ym1
+ pmulhw ym1, ym7
+ paddw ym1, ym2
+ pmulhrsw ym1, ym8
+ mova [dstq+dsq*0], xm1
+ vextracti32x4 [dstq+dsq*1], ym1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ pmullw ym0, ym4, [srcq+ssq*0+0]
+ pmullw ym1, ym5, [srcq+ssq*0+2]
+ paddw ym0, ym6
+ paddw ym0, ym1
+ psrlw ym0, 2
+ vinserti32x8 m0, ym0, 1
+.hv_w16_loop:
+ movu ym1, [srcq+ssq*1+0]
+ movu ym2, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x8 m1, [srcq+ssq*0+0], 1
+ vinserti32x8 m2, [srcq+ssq*0+2], 1
+ pmullw m1, m4
+ pmullw m2, m5
+ paddw m1, m6
+ paddw m1, m2
+ psrlw m1, 2 ; 1 2
+ vshufi32x4 m2, m0, m1, q1032 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ paddw m1, m1
+ pmulhw m1, m7
+ paddw m1, m2
+ pmulhrsw m1, m8
+ mova [dstq+dsq*0], ym1
+ vextracti32x8 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+.hv_w64:
+.hv_w128:
+ movifnidn wd, wm
+ lea r6d, [hq+wq*8-256]
+ mov r4, srcq
+ mov r7, dstq
+.hv_w32_loop0:
+ pmullw m0, m4, [srcq+ssq*0+0]
+ pmullw m1, m5, [srcq+ssq*0+2]
+ paddw m0, m6
+ paddw m0, m1
+ psrlw m0, 2
+.hv_w32_loop:
+ pmullw m3, m4, [srcq+ssq*1+0]
+ pmullw m1, m5, [srcq+ssq*1+2]
+ lea srcq, [srcq+ssq*2]
+ paddw m3, m6
+ paddw m3, m1
+ psrlw m3, 2
+ psubw m1, m3, m0
+ paddw m1, m1
+ pmulhw m1, m7
+ paddw m1, m0
+ pmullw m0, m4, [srcq+ssq*0+0]
+ pmullw m2, m5, [srcq+ssq*0+2]
+ paddw m0, m6
+ paddw m0, m2
+ psrlw m0, 2
+ psubw m2, m0, m3
+ paddw m2, m2
+ pmulhw m2, m7
+ paddw m2, m3
+ pmulhrsw m1, m8
+ pmulhrsw m2, m8
+ mova [dstq+dsq*0], m1
+ mova [dstq+dsq*1], m2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w32_loop
+ add r4, 64
+ add r7, 64
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .hv_w32_loop0
+ RET
+
+cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, w, h, mxy, stride3
+ movifnidn mxyd, r5m ; mx
+ lea r6, [prep_avx512icl]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .v
+.prep:
+ movzx wd, word [r6+wq*2+table_offset(prep,)]
+ mov r5d, r7m ; bitdepth_max
+ vpbroadcastd m5, [r6-prep_avx512icl+pw_8192]
+ add wq, r6
+ shr r5d, 11
+ vpbroadcastd m4, [r6-prep_avx512icl+prep_mul+r5*4]
+ lea stride3q, [strideq*3]
+ jmp wq
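+; prep stores intermediates rather than pixels: each row is scaled by a
+; per-bitdepth prep_mul constant (indexed by bitdepth_max >> 11) and biased
+; downward by pw_8192 via psubw, giving the signed 16-bit PREP_BIAS domain
+; that the compound (avg/w_avg/mask) kernels consume.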
+.prep_w4:
+ movq xmm0, [srcq+strideq*0]
+ movhps xmm0, [srcq+strideq*1]
+ vpbroadcastq ymm1, [srcq+strideq*2]
+ vpbroadcastq ymm2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd ymm0, ymm1, 0x30
+ vpblendd ymm0, ymm2, 0xc0
+ pmullw ymm0, ym4
+ psubw ymm0, ym5
+ mova [tmpq], ymm0
+ add tmpq, 32
+ sub hd, 4
+ jg .prep_w4
+ vzeroupper
+ RET
+.prep_w8:
+ movu xm0, [srcq+strideq*0]
+ vinserti32x4 ym0, [srcq+strideq*1], 1
+ vinserti32x4 m0, [srcq+strideq*2], 2
+ vinserti32x4 m0, [srcq+stride3q ], 3
+ lea srcq, [srcq+strideq*4]
+ pmullw m0, m4
+ psubw m0, m5
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 4
+ jg .prep_w8
+ RET
+.prep_w16:
+ movu ym0, [srcq+strideq*0]
+ vinserti32x8 m0, [srcq+strideq*1], 1
+ movu ym1, [srcq+strideq*2]
+ vinserti32x8 m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ pmullw m0, m4
+ pmullw m1, m4
+ psubw m0, m5
+ psubw m1, m5
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ add tmpq, 64*2
+ sub hd, 4
+ jg .prep_w16
+ RET
+.prep_w32:
+ pmullw m0, m4, [srcq+strideq*0]
+ pmullw m1, m4, [srcq+strideq*1]
+ pmullw m2, m4, [srcq+strideq*2]
+ pmullw m3, m4, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 4
+ jg .prep_w32
+ RET
+.prep_w64:
+ pmullw m0, m4, [srcq+strideq*0+64*0]
+ pmullw m1, m4, [srcq+strideq*0+64*1]
+ pmullw m2, m4, [srcq+strideq*1+64*0]
+ pmullw m3, m4, [srcq+strideq*1+64*1]
+ lea srcq, [srcq+strideq*2]
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 2
+ jg .prep_w64
+ RET
+.prep_w128:
+ pmullw m0, m4, [srcq+64*0]
+ pmullw m1, m4, [srcq+64*1]
+ pmullw m2, m4, [srcq+64*2]
+ pmullw m3, m4, [srcq+64*3]
+ add srcq, strideq
+ REPX {psubw x, m5}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ dec hd
+ jg .prep_w128
+ RET
+.h:
+ vpbroadcastw m5, mxyd
+ mov mxyd, r6m ; my
+ vpbroadcastd m4, [pw_16]
+ vpbroadcastd m6, [pw_32766]
+ psubw m4, m5
+ test dword r7m, 0x800
+ jnz .h_12bpc
+ psllw m4, 2
+ psllw m5, 2
+.h_12bpc:
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.h_w4:
+ movu xm1, [srcq+strideq*0]
+ vinserti32x4 ym1, [srcq+strideq*2], 1
+ movu xm2, [srcq+strideq*1]
+ vinserti32x4 ym2, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ punpcklqdq ym0, ym1, ym2
+ psrldq ym1, 2
+ psrldq ym2, 2
+ pmullw ym0, ym4
+ punpcklqdq ym1, ym2
+ pmullw ym1, ym5
+ psubw ym0, ym6
+ paddw ym0, ym1
+ psraw ym0, 2
+ mova [tmpq], ym0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4
+ RET
+.h_w8:
+ movu xm0, [srcq+strideq*0+0]
+ movu xm1, [srcq+strideq*0+2]
+ vinserti32x4 ym0, [srcq+strideq*1+0], 1
+ vinserti32x4 ym1, [srcq+strideq*1+2], 1
+ vinserti32x4 m0, [srcq+strideq*2+0], 2
+ vinserti32x4 m1, [srcq+strideq*2+2], 2
+ vinserti32x4 m0, [srcq+stride3q +0], 3
+ vinserti32x4 m1, [srcq+stride3q +2], 3
+ lea srcq, [srcq+strideq*4]
+ pmullw m0, m4
+ pmullw m1, m5
+ psubw m0, m6
+ paddw m0, m1
+ psraw m0, 2
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 4
+ jg .h_w8
+ RET
+.h_w16:
+ movu ym0, [srcq+strideq*0+0]
+ vinserti32x8 m0, [srcq+strideq*1+0], 1
+ movu ym1, [srcq+strideq*0+2]
+ vinserti32x8 m1, [srcq+strideq*1+2], 1
+ lea srcq, [srcq+strideq*2]
+ pmullw m0, m4
+ pmullw m1, m5
+ psubw m0, m6
+ paddw m0, m1
+ psraw m0, 2
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ pmullw m0, m4, [srcq+strideq*0+0]
+ pmullw m2, m5, [srcq+strideq*0+2]
+ pmullw m1, m4, [srcq+strideq*1+0]
+ pmullw m3, m5, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ psubw m0, m6
+ psubw m1, m6
+ paddw m0, m2
+ paddw m1, m3
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ add tmpq, 64*2
+ sub hd, 2
+ jg .h_w32
+ RET
+.h_w64:
+ pmullw m0, m4, [srcq+ 0]
+ pmullw m2, m5, [srcq+ 2]
+ pmullw m1, m4, [srcq+64]
+ pmullw m3, m5, [srcq+66]
+ add srcq, strideq
+ psubw m0, m6
+ psubw m1, m6
+ paddw m0, m2
+ paddw m1, m3
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ add tmpq, 64*2
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ pmullw m0, m4, [srcq+ 0]
+ pmullw m7, m5, [srcq+ 2]
+ pmullw m1, m4, [srcq+ 64]
+ pmullw m8, m5, [srcq+ 66]
+ pmullw m2, m4, [srcq+128]
+ pmullw m9, m5, [srcq+130]
+ pmullw m3, m4, [srcq+192]
+ pmullw m10, m5, [srcq+194]
+ add srcq, strideq
+ REPX {psubw x, m6}, m0, m1, m2, m3
+ paddw m0, m7
+ paddw m1, m8
+ paddw m2, m9
+ paddw m3, m10
+ REPX {psraw x, 2}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
+ vpbroadcastw m9, mxyd
+ vpbroadcastd m8, [pw_16]
+ vpbroadcastd m10, [pw_32766]
+ add wq, r6
+ lea stride3q, [strideq*3]
+ psubw m8, m9
+ test dword r7m, 0x800
+ jnz .v_12bpc
+ psllw m8, 2
+ psllw m9, 2
+.v_12bpc:
+ jmp wq
+.v_w4:
+ movq xmm0, [srcq+strideq*0]
+.v_w4_loop:
+ vpbroadcastq xmm2, [srcq+strideq*1]
+ vpbroadcastq ymm1, [srcq+strideq*2]
+ vpbroadcastq ymm3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd ymm2, ymm1, 0x30
+ vpblendd ymm2, ymm3, 0xc0
+ vpblendd ymm1, ymm2, ymm0, 0x03 ; 0 1 2 3
+ movq xmm0, [srcq+strideq*0]
+ valignq ymm2, ymm0, ymm2, 1 ; 1 2 3 4
+ pmullw ymm1, ym8
+ pmullw ymm2, ym9
+ psubw ymm1, ym10
+ paddw ymm1, ymm2
+ psraw ymm1, 2
+ mova [tmpq], ymm1
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ vzeroupper
+ RET
+.v_w8:
+ movu xm0, [srcq+strideq*0]
+.v_w8_loop:
+ vinserti32x4 ym1, ym0, [srcq+strideq*1], 1
+ vinserti32x4 m1, [srcq+strideq*2], 2
+ vinserti32x4 m1, [srcq+stride3q ], 3 ; 0 1 2 3
+ lea srcq, [srcq+strideq*4]
+ movu xm0, [srcq+strideq*0]
+ valignq m2, m0, m1, 2 ; 1 2 3 4
+ pmullw m1, m8
+ pmullw m2, m9
+ psubw m1, m10
+ paddw m1, m2
+ psraw m1, 2
+ mova [tmpq], m1
+ add tmpq, 64
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ movu ym0, [srcq+strideq*0]
+.v_w16_loop:
+ vinserti32x8 m1, m0, [srcq+strideq*1], 1 ; 0 1
+ movu ym3, [srcq+strideq*2]
+ vinserti32x8 m2, m3, [srcq+stride3q ], 1 ; 2 3
+ lea srcq, [srcq+strideq*4]
+ movu ym0, [srcq+strideq*0]
+ vshufi32x4 m3, m1, m3, q1032 ; 1 2
+ vshufi32x4 m4, m2, m0, q1032 ; 3 4
+ pmullw m1, m8
+ pmullw m2, m8
+ pmullw m3, m9
+ pmullw m4, m9
+ psubw m1, m10
+ psubw m2, m10
+ paddw m1, m3
+ paddw m2, m4
+ psraw m1, 2
+ psraw m2, 2
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m2
+ add tmpq, 64*2
+ sub hd, 4
+ jg .v_w16_loop
+ RET
+.v_w32:
+ movu m0, [srcq+strideq*0]
+.v_w32_loop:
+ movu m3, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ pmullw m1, m8, m0
+ movu m0, [srcq+strideq*0]
+ pmullw m2, m8, m3
+ pmullw m3, m9
+ pmullw m4, m9, m0
+ psubw m1, m10
+ psubw m2, m10
+ paddw m1, m3
+ paddw m2, m4
+ psraw m1, 2
+ psraw m2, 2
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m2
+ add tmpq, 64*2
+ sub hd, 2
+ jg .v_w32_loop
+ RET
+.v_w64:
+ movu m0, [srcq+64*0]
+ movu m1, [srcq+64*1]
+.v_w64_loop:
+ add srcq, strideq
+ pmullw m2, m8, m0
+ movu m0, [srcq+64*0]
+ pmullw m3, m8, m1
+ movu m1, [srcq+64*1]
+ pmullw m4, m9, m0
+ pmullw m5, m9, m1
+ psubw m2, m10
+ psubw m3, m10
+ paddw m2, m4
+ paddw m3, m5
+ psraw m2, 2
+ psraw m3, 2
+ mova [tmpq+64*0], m2
+ mova [tmpq+64*1], m3
+ add tmpq, 64*2
+ dec hd
+ jg .v_w64_loop
+ RET
+.v_w128:
+ movu m0, [srcq+64*0]
+ movu m1, [srcq+64*1]
+ movu m2, [srcq+64*2]
+ movu m3, [srcq+64*3]
+.v_w128_loop:
+ add srcq, strideq
+ pmullw m4, m8, m0
+ movu m0, [srcq+64*0]
+ pmullw m5, m8, m1
+ movu m1, [srcq+64*1]
+ pmullw m6, m8, m2
+ movu m2, [srcq+64*2]
+ pmullw m7, m8, m3
+ movu m3, [srcq+64*3]
+ pmullw m11, m9, m0
+ pmullw m12, m9, m1
+ pmullw m13, m9, m2
+ pmullw m14, m9, m3
+ REPX {psubw x, m10}, m4, m5, m6, m7
+ paddw m4, m11
+ paddw m5, m12
+ paddw m6, m13
+ paddw m7, m14
+ REPX {psraw x, 2}, m4, m5, m6, m7
+ mova [tmpq+64*0], m4
+ mova [tmpq+64*1], m5
+ mova [tmpq+64*2], m6
+ mova [tmpq+64*3], m7
+ add tmpq, 64*4
+ dec hd
+ jg .v_w128_loop
+ RET
+.hv:
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
+ shl mxyd, 11
+ vpbroadcastw m7, mxyd
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.hv_w4:
+ movq xmm0, [srcq+strideq*0+0]
+ movq xmm1, [srcq+strideq*0+2]
+ pmullw xmm0, xm4
+ pmullw xmm1, xm5
+ psubw xmm0, xm6
+ paddw xmm0, xmm1
+ psraw xmm0, 2
+ vpbroadcastq ym0, xmm0
+.hv_w4_loop:
+ movu xm1, [srcq+strideq*1]
+ vinserti128 ym1, [srcq+stride3q ], 1
+ movu xm2, [srcq+strideq*2]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 ym2, [srcq+strideq*0], 1
+ punpcklqdq ym3, ym1, ym2
+ psrldq ym1, 2
+ psrldq ym2, 2
+ pmullw ym3, ym4
+ punpcklqdq ym1, ym2
+ pmullw ym1, ym5
+ psubw ym3, ym6
+ paddw ym1, ym3
+ psraw ym1, 2 ; 1 2 3 4
+ valignq ym2, ym1, ym0, 3 ; 0 1 2 3
+ mova ym0, ym1
+ psubw ym1, ym2
+ pmulhrsw ym1, ym7
+ paddw ym1, ym2
+ mova [tmpq], ym1
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ pmullw xm0, xm4, [srcq+strideq*0+0]
+ pmullw xm1, xm5, [srcq+strideq*0+2]
+ psubw xm0, xm6
+ paddw xm0, xm1
+ psraw xm0, 2
+ vinserti32x4 m0, xm0, 3
+.hv_w8_loop:
+ movu xm1, [srcq+strideq*1+0]
+ movu xm2, [srcq+strideq*1+2]
+ vinserti32x4 ym1, [srcq+strideq*2+0], 1
+ vinserti32x4 ym2, [srcq+strideq*2+2], 1
+ vinserti32x4 m1, [srcq+stride3q +0], 2
+ vinserti32x4 m2, [srcq+stride3q +2], 2
+ lea srcq, [srcq+strideq*4]
+ vinserti32x4 m1, [srcq+strideq*0+0], 3
+ vinserti32x4 m2, [srcq+strideq*0+2], 3
+ pmullw m1, m4
+ pmullw m2, m5
+ psubw m1, m6
+ paddw m1, m2
+ psraw m1, 2 ; 1 2 3 4
+ valignq m2, m1, m0, 6 ; 0 1 2 3
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m7
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 64
+ sub hd, 4
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ pmullw ym0, ym4, [srcq+strideq*0+0]
+ pmullw ym1, ym5, [srcq+strideq*0+2]
+ psubw ym0, ym6
+ paddw ym0, ym1
+ psraw ym0, 2
+ vinserti32x8 m0, ym0, 1
+.hv_w16_loop:
+ movu ym1, [srcq+strideq*1+0]
+ movu ym2, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ vinserti32x8 m1, [srcq+strideq*0+0], 1
+ vinserti32x8 m2, [srcq+strideq*0+2], 1
+ pmullw m1, m4
+ pmullw m2, m5
+ psubw m1, m6
+ paddw m1, m2
+ psraw m1, 2 ; 1 2
+ vshufi32x4 m2, m0, m1, q1032 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m7
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 64
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+ pmullw m0, m4, [srcq+strideq*0+0]
+ pmullw m1, m5, [srcq+strideq*0+2]
+ psubw m0, m6
+ paddw m0, m1
+ psraw m0, 2
+.hv_w32_loop:
+ pmullw m3, m4, [srcq+strideq*1+0]
+ pmullw m1, m5, [srcq+strideq*1+2]
+ lea srcq, [srcq+strideq*2]
+ psubw m3, m6
+ paddw m3, m1
+ psraw m3, 2
+ psubw m1, m3, m0
+ pmulhrsw m1, m7
+ paddw m1, m0
+ pmullw m0, m4, [srcq+strideq*0+0]
+ pmullw m2, m5, [srcq+strideq*0+2]
+ psubw m0, m6
+ paddw m0, m2
+ psraw m0, 2
+ psubw m2, m0, m3
+ pmulhrsw m2, m7
+ paddw m2, m3
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m2
+ add tmpq, 64*2
+ sub hd, 2
+ jg .hv_w32_loop
+ RET
+.hv_w64:
+ pmullw m0, m4, [srcq+ 0]
+ pmullw m2, m5, [srcq+ 2]
+ pmullw m1, m4, [srcq+64]
+ pmullw m3, m5, [srcq+66]
+ psubw m0, m6
+ psubw m1, m6
+ paddw m0, m2
+ paddw m1, m3
+ psraw m0, 2
+ psraw m1, 2
+.hv_w64_loop:
+ add srcq, strideq
+ pmullw m2, m4, [srcq+ 0]
+ pmullw m8, m5, [srcq+ 2]
+ pmullw m3, m4, [srcq+64]
+ pmullw m9, m5, [srcq+66]
+ psubw m2, m6
+ psubw m3, m6
+ paddw m2, m8
+ paddw m3, m9
+ psraw m2, 2
+ psraw m3, 2
+ psubw m8, m2, m0
+ psubw m9, m3, m1
+ pmulhrsw m8, m7
+ pmulhrsw m9, m7
+ paddw m8, m0
+ mova m0, m2
+ paddw m9, m1
+ mova m1, m3
+ mova [tmpq+64*0], m8
+ mova [tmpq+64*1], m9
+ add tmpq, 64*2
+ dec hd
+ jg .hv_w64_loop
+ RET
+.hv_w128:
+ pmullw m0, m4, [srcq+ 0]
+ pmullw m8, m5, [srcq+ 2]
+ pmullw m1, m4, [srcq+ 64]
+ pmullw m9, m5, [srcq+ 66]
+ pmullw m2, m4, [srcq+128]
+ pmullw m10, m5, [srcq+130]
+ pmullw m3, m4, [srcq+192]
+ pmullw m11, m5, [srcq+194]
+ REPX {psubw x, m6}, m0, m1, m2, m3
+ paddw m0, m8
+ paddw m1, m9
+ paddw m2, m10
+ paddw m3, m11
+ REPX {psraw x, 2}, m0, m1, m2, m3
+.hv_w128_loop:
+ add srcq, strideq
+ pmullw m8, m4, [srcq+ 0]
+ pmullw m12, m5, [srcq+ 2]
+ pmullw m9, m4, [srcq+ 64]
+ pmullw m13, m5, [srcq+ 66]
+ pmullw m10, m4, [srcq+128]
+ pmullw m14, m5, [srcq+130]
+ pmullw m11, m4, [srcq+192]
+ pmullw m15, m5, [srcq+194]
+ REPX {psubw x, m6}, m8, m9, m10, m11
+ paddw m8, m12
+ paddw m9, m13
+ paddw m10, m14
+ paddw m11, m15
+ REPX {psraw x, 2}, m8, m9, m10, m11
+ psubw m12, m8, m0
+ psubw m13, m9, m1
+ psubw m14, m10, m2
+ psubw m15, m11, m3
+ REPX {pmulhrsw x, m7}, m12, m13, m14, m15
+ paddw m12, m0
+ mova m0, m8
+ paddw m13, m1
+ mova m1, m9
+ mova [tmpq+64*0], m12
+ mova [tmpq+64*1], m13
+ paddw m14, m2
+ mova m2, m10
+ paddw m15, m3
+ mova m3, m11
+ mova [tmpq+64*2], m14
+ mova [tmpq+64*3], m15
+ add tmpq, 64*4
+ dec hd
+ jg .hv_w128_loop
+ RET
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
+
+%macro MC_8TAP_FN 4 ; prefix, type, type_h, type_v
+cglobal %1_8tap_%2_16bpc
+ mov t0d, FILTER_%3
+%ifidn %3, %4
+ mov t1d, t0d
+%else
+ mov t1d, FILTER_%4
+%endif
+%ifnidn %2, regular ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%1_8tap_16bpc %+ SUFFIX)
+%endif
+%endmacro
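+; The FILTER_* values pack two row offsets into subpel_filters[5][15][8]:
+; the high word selects an 8-tap table, the low byte the 4-tap variant used
+; for small blocks. Since mx/my are multiplied by 0x010101 before t0/t1 are
+; added, a single add yields the 8-tap index in the upper bits, the raw
+; mx/my in the middle byte, and the 4-tap index in the low byte, which the
+; narrow-width paths read back with movzx and the wide paths with shr 16.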
+
+%if WIN64
+DECLARE_REG_TMP 4, 5
+%define buf rsp+stack_offset+8 ; shadow space
+%else
+DECLARE_REG_TMP 7, 8
+%define buf rsp-40 ; red zone
+%endif
+
+MC_8TAP_FN put, sharp, SHARP, SHARP
+MC_8TAP_FN put, sharp_smooth, SHARP, SMOOTH
+MC_8TAP_FN put, smooth_sharp, SMOOTH, SHARP
+MC_8TAP_FN put, smooth, SMOOTH, SMOOTH
+MC_8TAP_FN put, sharp_regular, SHARP, REGULAR
+MC_8TAP_FN put, regular_sharp, REGULAR, SHARP
+MC_8TAP_FN put, smooth_regular, SMOOTH, REGULAR
+MC_8TAP_FN put, regular_smooth, REGULAR, SMOOTH
+MC_8TAP_FN put, regular, REGULAR, REGULAR
+
+cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my
+%define base r8-put_avx512icl
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r8, [put_avx512icl]
+ movifnidn wd, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r8+wq*2+table_offset(put,)]
+ add wq, r8
+%if WIN64
+ pop r8
+%endif
+ jmp wq
+.h_w2:
+ movzx mxd, mxb
+ sub srcq, 2
+ mova ym2, [spel_h_shuf2a]
+ pmovsxbw xmm4, [base+subpel_filters+mxq*8]
+ pshufd xmm3, xmm4, q1111
+ pshufd xmm4, xmm4, q2222
+.h_w2_loop:
+ movu xm1, [srcq+ssq*0]
+ vinserti32x4 ym1, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ mova xmm0, xm8
+ vpermb ym1, ym2, ym1
+ vpdpwssd xmm0, xmm3, xm1
+ vextracti32x4 xm1, ym1, 1
+ vpdpwssd xmm0, xmm4, xm1
+ psrad xmm0, 6
+ packusdw xmm0, xmm0
+ pminsw xmm0, xm9
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ movzx mxd, mxb
+ sub srcq, 2
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ vbroadcasti32x4 ym4, [spel_h_shufA]
+ vbroadcasti32x4 ym5, [spel_h_shufB]
+ pshufd xmm0, xmm0, q2211
+ vpbroadcastq ym6, xmm0
+ vpermq ym7, ymm0, q1111
+.h_w4_loop:
+ movu xm2, [srcq+ssq*0]
+ vinserti32x4 ym2, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ mova ym0, ym8
+ pshufb ym1, ym2, ym4
+ vpdpwssd ym0, ym6, ym1
+ pshufb ym2, ym5
+ vpdpwssd ym0, ym7, ym2
+ psrad ym0, 6
+ vextracti32x4 xm1, ym0, 1
+ packusdw xm0, xm1
+ pminsw xmm0, xm0, xm9
+ movq [dstq+dsq*0], xmm0
+ movhps [dstq+dsq*1], xmm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h:
+ test myd, 0xf00
+ jnz .hv
+ mov r7d, r8m
+ vpbroadcastw m9, r8m
+ shr r7d, 11
+ vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4]
+ cmp wd, 4
+ je .h_w4
+ jl .h_w2
+ shr mxd, 16
+ sub srcq, 6
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ mova [buf], xmm0
+ vpbroadcastd m10, xmm0
+ vpbroadcastd m11, [buf+ 4]
+ vpbroadcastd m12, [buf+ 8]
+ vpbroadcastd m13, [buf+12]
+ sub wd, 16
+ je .h_w16
+ jg .h_w32
+.h_w8:
+ mova m4, [spel_h_shufA]
+ movu m5, [spel_h_shufB]
+ movu m6, [spel_h_shufC]
+ mova m7, [spel_h_shufD]
+.h_w8_loop:
+ movu ym2, [srcq+ssq*0]
+ vinserti32x8 m2, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ mova m0, m8
+ vpermb m1, m4, m2
+ vpdpwssd m0, m10, m1
+ vpermb m1, m5, m2
+ vpdpwssd m0, m11, m1
+ vpermb m1, m6, m2
+ vpdpwssd m0, m12, m1
+ vpermb m1, m7, m2
+ vpdpwssd m0, m13, m1
+ psrad m0, 6
+ vextracti32x8 ym1, m0, 1
+ packusdw ym0, ym1
+ pminsw ym0, ym9
+ mova [dstq+dsq*0], xm0
+ vextracti32x4 [dstq+dsq*1], ym0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8_loop
+ RET
+.h_w16:
+ vbroadcasti32x4 m6, [spel_h_shufA]
+ vbroadcasti32x4 m7, [spel_h_shufB]
+.h_w16_loop:
+ movu ym2, [srcq+ssq*0+ 0]
+ vinserti32x8 m2, [srcq+ssq*1+ 0], 1
+ movu ym3, [srcq+ssq*0+16]
+ vinserti32x8 m3, [srcq+ssq*1+16], 1
+ lea srcq, [srcq+ssq*2]
+ mova m0, m8
+ mova m1, m8
+ pshufb m4, m2, m6
+ vpdpwssd m0, m10, m4 ; a0
+ pshufb m4, m3, m6
+ vpdpwssd m1, m12, m4 ; b2
+ pshufb m4, m2, m7
+ vpdpwssd m0, m11, m4 ; a1
+ pshufb m4, m3, m7
+ vpdpwssd m1, m13, m4 ; b3
+ shufpd m2, m3, 0x55
+ pshufb m4, m2, m6
+ vpdpwssd m0, m12, m4 ; a2
+ vpdpwssd m1, m10, m4 ; b0
+ pshufb m2, m7
+ vpdpwssd m0, m13, m2 ; a3
+ vpdpwssd m1, m11, m2 ; b1
+ psrad m0, 6
+ psrad m1, 6
+ packusdw m0, m1
+ pminsw m0, m9
+ mova [dstq+dsq*0], ym0
+ vextracti32x8 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16_loop
+ RET
+.h_w32:
+ lea srcq, [srcq+wq*2]
+ vbroadcasti32x4 m6, [spel_h_shufA]
+ lea dstq, [dstq+wq*2]
+ vbroadcasti32x4 m7, [spel_h_shufB]
+ neg wq
+.h_w32_loop0:
+ mov r6, wq
+.h_w32_loop:
+ movu m2, [srcq+r6*2+ 0]
+ movu m3, [srcq+r6*2+ 8]
+ mova m0, m8
+ mova m1, m8
+ pshufb m4, m2, m6
+ vpdpwssd m0, m10, m4 ; a0
+ pshufb m4, m3, m6
+ vpdpwssd m1, m10, m4 ; b0
+ vpdpwssd m0, m12, m4 ; a2
+ movu m4, [srcq+r6*2+16]
+ pshufb m3, m7
+ vpdpwssd m1, m11, m3 ; b1
+ vpdpwssd m0, m13, m3 ; a3
+ pshufb m3, m4, m6
+ vpdpwssd m1, m12, m3 ; b2
+ pshufb m2, m7
+ vpdpwssd m0, m11, m2 ; a1
+ pshufb m4, m7
+ vpdpwssd m1, m13, m4 ; b3
+ psrad m0, 6
+ psrad m1, 6
+ packusdw m0, m1
+ pminsw m0, m9
+ mova [dstq+r6*2], m0
+ add r6, 32
+ jl .h_w32_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w32_loop0
+ RET
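+; Each horizontal kernel evaluates the 8-tap filter as four vpdpwssd steps:
+; the 16-bit taps are paired into dwords, so tap pairs (0,1)/(2,3)/(4,5)/
+; (6,7) each contribute one accumulating dword dot product per pixel. The
+; result carries the bitdepth-dependent put_8tap_h_rnd bias, is shifted
+; right by 6, packed with packusdw, and clamped against pixel_max in m9.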
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastd m10, [pd_32]
+ pmovsxbw xmm0, [base+subpel_filters+myq*8]
+ tzcnt r7d, wd
+ vpbroadcastw m11, r8m
+ lea r6, [ssq*3]
+ movzx r7d, word [r8+r7*2+table_offset(put, _8tap_v)]
+ sub srcq, r6
+ mova [rsp+stack_offset+8], xmm0
+ vpbroadcastd m12, xmm0
+ add r7, r8
+ vpbroadcastd m13, [rsp+stack_offset+12]
+ vpbroadcastd m14, [rsp+stack_offset+16]
+ vpbroadcastd m15, [rsp+stack_offset+20]
+ jmp r7
+.v_w2:
+ movd xmm2, [srcq+ssq*0]
+ pinsrd xmm2, [srcq+ssq*1], 1
+ pinsrd xmm2, [srcq+ssq*2], 2
+ add srcq, r6
+ pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3
+ movd xmm3, [srcq+ssq*1]
+ vpbroadcastd xmm1, [srcq+ssq*2]
+ add srcq, r6
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm3, xmm1, 0x02 ; 4 5
+ vpblendd xmm1, xmm0, 0x02 ; 5 6
+ palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4
+ punpcklwd xmm3, xmm1 ; 45 56
+ punpcklwd xmm1, xmm2, xmm4 ; 01 12
+ punpckhwd xmm2, xmm4 ; 23 34
+.v_w2_loop:
+ vpbroadcastd xmm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova xmm5, xm10
+ vpdpwssd xmm5, xm12, xmm1 ; a0 b0
+ mova xmm1, xmm2
+ vpdpwssd xmm5, xm13, xmm2 ; a1 b1
+ mova xmm2, xmm3
+ vpdpwssd xmm5, xm14, xmm3 ; a2 b2
+ vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7
+ vpbroadcastd xmm0, [srcq+ssq*0]
+ vpblendd xmm4, xmm0, 0x02 ; 7 8
+ punpcklwd xmm3, xmm4 ; 67 78
+ vpdpwssd xmm5, xm15, xmm3 ; a3 b3
+ psrad xmm5, 6
+ packusdw xmm5, xmm5
+ pminsw xmm5, xm11
+ movd [dstq+dsq*0], xmm5
+ pextrd [dstq+dsq*1], xmm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movq xmm1, [srcq+ssq*0]
+ vpbroadcastq ymm0, [srcq+ssq*1]
+ vpbroadcastq ymm2, [srcq+ssq*2]
+ add srcq, r6
+ vpbroadcastq ymm4, [srcq+ssq*0]
+ vpbroadcastq ymm3, [srcq+ssq*1]
+ vpbroadcastq ymm5, [srcq+ssq*2]
+ add srcq, r6
+ vpblendd ymm1, ymm0, 0x30
+ vpblendd ymm0, ymm2, 0x30
+ punpcklwd ymm1, ymm0 ; 01 12
+ vpbroadcastq ymm0, [srcq+ssq*0]
+ vpblendd ymm2, ymm4, 0x30
+ vpblendd ymm4, ymm3, 0x30
+ punpcklwd ymm2, ymm4 ; 23 34
+ vpblendd ymm3, ymm5, 0x30
+ vpblendd ymm5, ymm0, 0x30
+ punpcklwd ymm3, ymm5 ; 45 56
+.v_w4_loop:
+ vpbroadcastq ymm5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova ymm4, ym10
+ vpdpwssd ymm4, ym12, ymm1 ; a0 b0
+ mova ymm1, ymm2
+ vpdpwssd ymm4, ym13, ymm2 ; a1 b1
+ mova ymm2, ymm3
+ vpdpwssd ymm4, ym14, ymm3 ; a2 b2
+ vpblendd ymm3, ymm0, ymm5, 0x30
+ vpbroadcastq ymm0, [srcq+ssq*0]
+ vpblendd ymm5, ymm0, 0x30
+ punpcklwd ymm3, ymm5 ; 67 78
+ vpdpwssd ymm4, ym15, ymm3 ; a3 b3
+ psrad ymm4, 6
+ vextracti128 xmm5, ymm4, 1
+ packusdw xmm4, xmm5
+ pminsw xmm4, xm11
+ movq [dstq+dsq*0], xmm4
+ movhps [dstq+dsq*1], xmm4
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ vzeroupper
+ RET
+.v_w8:
+ vbroadcasti32x4 m2, [srcq+ssq*2]
+ vinserti32x4 m1, m2, [srcq+ssq*0], 0
+ vinserti32x4 m1, [srcq+ssq*1], 1 ; 0 1 2
+ add srcq, r6
+ vinserti32x4 ym2, [srcq+ssq*0], 1
+ vinserti32x4 m2, [srcq+ssq*1], 2 ; 2 3 4
+ mova m6, [spel_v_shuf8]
+ movu xm0, [srcq+ssq*1]
+ vinserti32x4 ym0, [srcq+ssq*2], 1
+ add srcq, r6
+ vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6
+ vpermb m1, m6, m1 ; 01 12
+ vpermb m2, m6, m2 ; 23 34
+ vpermb m3, m6, m0 ; 45 56
+.v_w8_loop:
+ vinserti32x4 m0, [srcq+ssq*1], 3
+ lea srcq, [srcq+ssq*2]
+ movu xm5, [srcq+ssq*0]
+ mova m4, m10
+ vpdpwssd m4, m12, m1 ; a0 b0
+ mova m1, m2
+ vshufi32x4 m0, m5, q1032 ; 6 7 8
+ vpdpwssd m4, m13, m2 ; a1 b1
+ mova m2, m3
+ vpdpwssd m4, m14, m3 ; a2 b2
+ vpermb m3, m6, m0 ; 67 78
+ vpdpwssd m4, m15, m3 ; a3 b3
+ psrad m4, 6
+ vextracti32x8 ym5, m4, 1
+ packusdw ym4, ym5
+ pminsw ym4, ym11
+ mova [dstq+dsq*0], xm4
+ vextracti32x4 [dstq+dsq*1], ym4, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+ vbroadcasti32x8 m1, [srcq+ssq*1]
+ vinserti32x8 m0, m1, [srcq+ssq*0], 0
+ vinserti32x8 m1, [srcq+ssq*2], 1
+ mova m8, [spel_v_shuf16]
+ add srcq, r6
+ movu ym3, [srcq+ssq*0]
+ vinserti32x8 m3, [srcq+ssq*1], 1
+ movu ym5, [srcq+ssq*2]
+ add srcq, r6
+ vinserti32x8 m5, [srcq+ssq*0], 1
+ vpermb m0, m8, m0 ; 01
+ vpermb m1, m8, m1 ; 12
+ vpermb m3, m8, m3 ; 34
+ vpermb m5, m8, m5 ; 56
+ mova m9, [deint_q_shuf]
+ vpshrdd m2, m1, m3, 16 ; 23
+ vpshrdd m4, m3, m5, 16 ; 45
+.v_w16_loop:
+ mova m6, m10
+ mova m7, m10
+ vpdpwssd m6, m12, m0 ; a0
+ mova m0, m2
+ vpdpwssd m7, m12, m1 ; b0
+ mova m1, m3
+ vpdpwssd m6, m13, m2 ; a1
+ mova m2, m4
+ vpdpwssd m7, m13, m3 ; b1
+ mova m3, m5
+ vpdpwssd m6, m14, m4 ; a2
+ mova m4, m5
+ vpdpwssd m7, m14, m5 ; b2
+ movu ym5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x8 m5, [srcq+ssq*0], 1
+ vpermb m5, m8, m5 ; 78
+ vpshrdd m4, m5, 16 ; 67
+ vpdpwssd m6, m15, m4 ; a3
+ vpdpwssd m7, m15, m5 ; b3
+ psrad m6, 6
+ psrad m7, 6
+ packusdw m6, m7
+ pminsw m6, m11
+ vpermq m6, m9, m6
+ mova [dstq+dsq*0], ym6
+ vextracti32x8 [dstq+dsq*1], m6, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ RET
+.v_w32:
+.v_w64:
+.v_w128:
+%if WIN64
+ movaps [rsp+stack_offset+8], xmm6
+%endif
+ lea wd, [hq+wq*8-256]
+ mov r7, srcq
+ mov r8, dstq
+.v_w32_loop0:
+ movu m16, [srcq+ssq*0]
+ movu m17, [srcq+ssq*1]
+ movu m18, [srcq+ssq*2]
+ add srcq, r6
+ movu m19, [srcq+ssq*0]
+ movu m20, [srcq+ssq*1]
+ movu m21, [srcq+ssq*2]
+ add srcq, r6
+ movu m22, [srcq+ssq*0]
+ punpcklwd m0, m16, m17 ; 01l
+ punpckhwd m16, m17 ; 01h
+ punpcklwd m1, m17, m18 ; 12l
+ punpckhwd m17, m18 ; 12h
+ punpcklwd m2, m18, m19 ; 23l
+ punpckhwd m18, m19 ; 23h
+ punpcklwd m3, m19, m20 ; 34l
+ punpckhwd m19, m20 ; 34h
+ punpcklwd m4, m20, m21 ; 45l
+ punpckhwd m20, m21 ; 45h
+ punpcklwd m5, m21, m22 ; 56l
+ punpckhwd m21, m22 ; 56h
+.v_w32_loop:
+ mova m6, m10
+ vpdpwssd m6, m12, m0 ; a0l
+ mova m8, m10
+ vpdpwssd m8, m12, m16 ; a0h
+ mova m7, m10
+ vpdpwssd m7, m12, m1 ; b0l
+ mova m9, m10
+ vpdpwssd m9, m12, m17 ; b0h
+ mova m0, m2
+ vpdpwssd m6, m13, m2 ; a1l
+ mova m16, m18
+ vpdpwssd m8, m13, m18 ; a1h
+ mova m1, m3
+ vpdpwssd m7, m13, m3 ; b1l
+ mova m17, m19
+ vpdpwssd m9, m13, m19 ; b1h
+ mova m2, m4
+ vpdpwssd m6, m14, m4 ; a2l
+ mova m18, m20
+ vpdpwssd m8, m14, m20 ; a2h
+ mova m3, m5
+ vpdpwssd m7, m14, m5 ; b2l
+ mova m19, m21
+ vpdpwssd m9, m14, m21 ; b2h
+ movu m21, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklwd m4, m22, m21 ; 67l
+ punpckhwd m20, m22, m21 ; 67h
+ movu m22, [srcq+ssq*0]
+ vpdpwssd m6, m15, m4 ; a3l
+ vpdpwssd m8, m15, m20 ; a3h
+ punpcklwd m5, m21, m22 ; 78l
+ punpckhwd m21, m22 ; 78h
+ vpdpwssd m7, m15, m5 ; b3l
+ vpdpwssd m9, m15, m21 ; b3h
+ REPX {psrad x, 6}, m6, m8, m7, m9
+ packusdw m6, m8
+ packusdw m7, m9
+ pminsw m6, m11
+ pminsw m7, m11
+ mova [dstq+dsq*0], m6
+ mova [dstq+dsq*1], m7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w32_loop
+ add r7, 64
+ add r8, 64
+ movzx hd, wb
+ mov srcq, r7
+ mov dstq, r8
+ sub wd, 1<<8
+ jg .v_w32_loop0
+%if WIN64
+ movaps xmm6, [rsp+stack_offset+8]
+%endif
+ vzeroupper
+ RET
+.hv:
+ vpbroadcastw m11, r8m
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ pmovsxbw xmm1, [base+subpel_filters+myq*8]
+ lea r6, [ssq*3]
+ sub srcq, 2
+ sub srcq, r6
+ test dword r8m, 0x800
+ jnz .hv_12bit
+ vpbroadcastd m10, [pd_2176]
+ psllw xmm0, 6
+ jmp .hv_main
+.hv_12bit:
+ vpbroadcastd m10, [pd_640]
+ psllw xmm0, 4
+ psllw xmm1, 2
+.hv_main:
+ mova [buf+ 0], xmm0
+ mova [buf+16], xmm1
+ vpbroadcastd m8, [buf+ 4]
+ vpbroadcastd m9, [buf+ 8]
+ vpbroadcastd ym12, xmm1
+ vpbroadcastd ym13, [buf+20]
+ vpbroadcastd ym14, [buf+24]
+ vpbroadcastd ym15, [buf+28]
+ movu xm4, [srcq+ssq*0]
+ vinserti32x4 ym4, [srcq+ssq*1], 1
+ vinserti32x4 m4, [srcq+ssq*2], 2
+ add srcq, r6
+ vinserti32x4 m4, [srcq+ssq*0], 3 ; 0 1 2 3
+ movu xm0, [srcq+ssq*1]
+ vinserti32x4 ym0, [srcq+ssq*2], 1
+ add srcq, r6
+ vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6
+ cmp wd, 4
+ je .hv_w4
+ vbroadcasti32x4 m2, [spel_h_shufA]
+ mova m3, [spel_h_shuf2b]
+ mova ym6, [spel_h_shuf2a]
+ mova xm7, [spel_shuf2]
+ mova m1, m10
+ pshufb m4, m2
+ pshufb m0, m2
+ punpcklqdq m2, m4, m0
+ vpdpwssd m1, m8, m2 ; 04 15 26 3_
+ punpckhqdq m4, m0
+ vpdpwssd m1, m9, m4
+ vpermb m1, m3, m1 ; 01 12
+ vextracti32x4 xm2, ym1, 1 ; 23 34
+ vextracti32x4 xm3, m1, 2 ; 45 56
+.hv_w2_loop:
+ movu xm5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x4 ym5, [srcq+ssq*0], 1
+ mova xm4, xm10
+ vpermb ym5, ym6, ym5
+ pmaddwd xmm0, xm12, xm1 ; a0 b0
+ vpdpwssd xm4, xm8, xm5
+ vextracti32x4 xm5, ym5, 1
+ mova xm1, xm2
+ vpdpwssd xmm0, xm13, xm2 ; a1 b1
+ vpdpwssd xm4, xm9, xm5 ; 7 8
+ mova xm2, xm3
+ vpdpwssd xmm0, xm14, xm3 ; a2 b2
+ vpermt2b xm3, xm7, xm4 ; 67 78
+ vpdpwssd xmm0, xm15, xm3 ; a3 b3
+ psrad xmm0, 10
+ packusdw xmm0, xmm0
+ pminsw xmm0, xm11
+ movd [dstq+dsq*0], xmm0
+ pextrd [dstq+dsq*1], xmm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ vbroadcasti32x4 m19, [spel_h_shufA]
+ vbroadcasti32x4 m20, [spel_h_shufB]
+ mova ym6, [spel_shuf4a]
+ mova ym7, [spel_shuf4b]
+ mova m2, m10
+ mova m3, m10
+ pshufb m1, m4, m19
+ vpdpwssd m2, m8, m1
+ pshufb m1, m0, m19
+ vpdpwssd m3, m8, m1
+ pshufb m4, m20
+ vpdpwssd m2, m9, m4
+ pshufb m0, m20
+ vpdpwssd m3, m9, m0
+ vpermb m1, m6, m2 ; 01 12
+ vshufi32x4 m2, m3, q1032
+ vpermb m3, m6, m3 ; 45 56
+ vpermb m2, m6, m2 ; 23 34
+.hv_w4_loop:
+ movu xm18, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti128 ym18, [srcq+ssq*0], 1
+ mova ym4, ym10
+ pshufb ym17, ym18, ym19
+ pmaddwd ym16, ym12, ym1 ; a0 b0
+ vpdpwssd ym4, ym8, ym17
+ pshufb ym18, ym20
+ mova ym1, ym2
+ vpdpwssd ym16, ym13, ym2 ; a1 b1
+ vpdpwssd ym4, ym9, ym18 ; 7 8
+ mova ym2, ym3
+ vpdpwssd ym16, ym14, ym3 ; a2 b2
+ vpermt2b ym3, ym7, ym4 ; 67 78
+ vpdpwssd ym16, ym15, ym3 ; a3 b3
+ psrad ym16, 10
+ vextracti128 xm17, ym16, 1
+ packusdw xm16, xm17
+ pminsw xm16, xm11
+ movq [dstq+dsq*0], xm16
+ movhps [dstq+dsq*1], xm16
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ vzeroupper
+ RET
+.hv_w8:
+ shr mxd, 16
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ pmovsxbw xmm1, [base+subpel_filters+myq*8]
+ lea r6, [ssq*3]
+ sub srcq, 6
+ sub srcq, r6
+ test dword r8m, 0x800
+ jnz .hv_w8_12bit
+ vpbroadcastd m10, [pd_2176]
+ psllw xmm0, 6
+ jmp .hv_w8_main
+.hv_w8_12bit:
+ vpbroadcastd m10, [pd_640]
+ psllw xmm0, 4
+ psllw xmm1, 2
+.hv_w8_main:
+ mova [buf+ 0], xmm0
+ mova [buf+16], xmm1
+ vpbroadcastd m12, xmm0
+ vpbroadcastd m13, [buf+ 4]
+ vpbroadcastd m14, [buf+ 8]
+ vpbroadcastd m15, [buf+12]
+ vpbroadcastd m16, xmm1
+ vpbroadcastd m17, [buf+20]
+ vpbroadcastd m18, [buf+24]
+ vpbroadcastd m19, [buf+28]
+ cmp wd, 16
+ je .hv_w16
+ jg .hv_w32
+ mova m5, [spel_h_shufA]
+ movu ym0, [srcq+ssq*0]
+ vinserti32x8 m0, [srcq+ssq*1], 1 ; 0 1
+ movu ym9, [srcq+ssq*2]
+ add srcq, r6
+ vinserti32x8 m9, [srcq+ssq*0], 1 ; 2 3
+ movu ym20, [srcq+ssq*1]
+ vinserti32x8 m20, [srcq+ssq*2], 1 ; 4 5
+ add srcq, r6
+ movu ym21, [srcq+ssq*0] ; 6
+ movu m6, [spel_h_shufB]
+ movu m7, [spel_h_shufC]
+ vpermb m8, m5, m0
+ mova m1, m10
+ vpdpwssd m1, m12, m8 ; a0 b0
+ vpermb m8, m5, m9
+ mova m2, m10
+ vpdpwssd m2, m12, m8 ; c0 d0
+ vpermb m8, m5, m20
+ mova m3, m10
+ vpdpwssd m3, m12, m8 ; e0 f0
+ vpermb m8, m5, m21
+ mova m4, m10
+ vpdpwssd m4, m12, m8 ; g0
+ vpermb m8, m6, m0
+ vpdpwssd m1, m13, m8 ; a1 b1
+ vpermb m8, m6, m9
+ vpdpwssd m2, m13, m8 ; c1 d1
+ vpermb m8, m6, m20
+ vpdpwssd m3, m13, m8 ; e1 f1
+ vpermb m8, m6, m21
+ vpdpwssd m4, m13, m8 ; g1
+ vpermb m8, m7, m0
+ vpdpwssd m1, m14, m8 ; a2 b2
+ vpermb m8, m7, m9
+ vpdpwssd m2, m14, m8 ; c2 d2
+ vpermb m8, m7, m20
+ vpdpwssd m3, m14, m8 ; e2 f2
+ vpermb m8, m7, m21
+ vpdpwssd m4, m14, m8 ; g2
+ mova m8, [spel_h_shufD]
+ vpermb m0, m8, m0
+ vpdpwssd m1, m15, m0 ; a3 b3
+ mova m0, [spel_shuf8a]
+ vpermb m9, m8, m9
+ vpdpwssd m2, m15, m9 ; c3 d3
+ mova m9, [spel_shuf8b]
+ vpermb m20, m8, m20
+ vpdpwssd m3, m15, m20 ; e3 f3
+ vpermb m21, m8, m21
+ vpdpwssd m4, m15, m21 ; g3
+ vpermt2b m1, m0, m2 ; 01 12
+ vpermt2b m2, m0, m3 ; 23 34
+ vpermt2b m3, m0, m4 ; 45 56
+.hv_w8_loop:
+ movu ym0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x8 m0, [srcq+ssq*0], 1
+ mova m4, m10
+ vpermb m21, m5, m0
+ vpdpwssd m4, m12, m21 ; h0 i0
+ vpermb m21, m6, m0
+ pmaddwd m20, m16, m1 ; A0 B0
+ vpdpwssd m4, m13, m21 ; h1 i1
+ vpermb m21, m7, m0
+ mova m1, m2
+ vpdpwssd m20, m17, m2 ; A1 B1
+ vpdpwssd m4, m14, m21 ; h2 i2
+ vpermb m21, m8, m0
+ mova m2, m3
+ vpdpwssd m20, m18, m3 ; A2 B2
+ vpdpwssd m4, m15, m21 ; h3 i3
+ vpermt2b m3, m9, m4 ; 67 78
+ vpdpwssd m20, m19, m3 ; A3 B3
+ psrad m20, 10
+ vextracti32x8 ym21, m20, 1
+ packusdw ym20, ym21
+ pminsw ym20, ym11
+ mova [dstq+dsq*0], xm20
+ vextracti128 [dstq+dsq*1], ym20, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ vzeroupper
+ RET
+.hv_w16:
+ WIN64_SPILL_XMM 26
+ vbroadcasti32x8 m5, [srcq+ssq*0+ 8]
+ vinserti32x8 m4, m5, [srcq+ssq*0+ 0], 0
+ vinserti32x8 m5, [srcq+ssq*0+16], 1 ; 0
+ movu ym6, [srcq+ssq*1+ 0]
+ movu ym7, [srcq+ssq*1+16]
+ vinserti32x8 m6, [srcq+ssq*2+ 0], 1
+ vinserti32x8 m7, [srcq+ssq*2+16], 1 ; 1 2
+ add srcq, r6
+ movu ym22, [srcq+ssq*0+ 0]
+ movu ym23, [srcq+ssq*0+16]
+ vinserti32x8 m22, [srcq+ssq*1+ 0], 1
+ vinserti32x8 m23, [srcq+ssq*1+16], 1 ; 3 4
+ movu ym24, [srcq+ssq*2+ 0]
+ movu ym25, [srcq+ssq*2+16]
+ add srcq, r6
+ vinserti32x8 m24, [srcq+ssq*0+ 0], 1
+ vinserti32x8 m25, [srcq+ssq*0+16], 1 ; 5 6
+ vbroadcasti32x4 m20, [spel_h_shufA]
+ vbroadcasti32x4 m21, [spel_h_shufB]
+ mova m9, [spel_shuf16]
+ pshufb m0, m4, m20
+ mova m1, m10
+ vpdpwssd m1, m12, m0 ; a0
+ pshufb m0, m6, m20
+ mova m2, m10
+ vpdpwssd m2, m12, m0 ; b0
+ pshufb m0, m7, m20
+ mova m3, m10
+ vpdpwssd m3, m14, m0 ; c2
+ pshufb m0, m4, m21
+ vpdpwssd m1, m13, m0 ; a1
+ pshufb m0, m6, m21
+ vpdpwssd m2, m13, m0 ; b1
+ pshufb m0, m7, m21
+ vpdpwssd m3, m15, m0 ; c3
+ pshufb m0, m5, m20
+ vpdpwssd m1, m14, m0 ; a2
+ shufpd m6, m7, 0x55
+ pshufb m7, m6, m20
+ vpdpwssd m2, m14, m7 ; b2
+ vpdpwssd m3, m12, m7 ; c0
+ pshufb m5, m21
+ vpdpwssd m1, m15, m5 ; a3
+ pshufb m6, m21
+ vpdpwssd m2, m15, m6 ; b3
+ vpdpwssd m3, m13, m6 ; c1
+ pshufb m0, m22, m20
+ mova m4, m10
+ vpdpwssd m4, m12, m0 ; d0
+ pshufb m0, m23, m20
+ mova m5, m10
+ vpdpwssd m5, m14, m0 ; e2
+ pshufb m0, m24, m20
+ mova m6, m10
+ vpdpwssd m6, m12, m0 ; f0
+ pshufb m0, m25, m20
+ mova m7, m10
+ vpdpwssd m7, m14, m0 ; g2
+ pshufb m0, m22, m21
+ vpdpwssd m4, m13, m0 ; d1
+ pshufb m0, m23, m21
+ vpdpwssd m5, m15, m0 ; e3
+ pshufb m0, m24, m21
+ vpdpwssd m6, m13, m0 ; f1
+ pshufb m0, m25, m21
+ vpdpwssd m7, m15, m0 ; g3
+ shufpd m22, m23, 0x55
+ pshufb m23, m22, m20
+ vpdpwssd m4, m14, m23 ; d2
+ vpdpwssd m5, m12, m23 ; e0
+ shufpd m24, m25, 0x55
+ pshufb m25, m24, m20
+ vpdpwssd m6, m14, m25 ; f2
+ vpdpwssd m7, m12, m25 ; g0
+ pshufb m22, m21
+ vpdpwssd m4, m15, m22 ; d3
+ vpdpwssd m5, m13, m22 ; e1
+ pshufb m24, m21
+ vpdpwssd m6, m15, m24 ; f3
+ vpdpwssd m7, m13, m24 ; g1
+ pslldq m1, 1
+ vpermt2b m2, m9, m3 ; 12
+ vpermt2b m4, m9, m5 ; 34
+ vpermt2b m6, m9, m7 ; 56
+ vpshrdd m1, m2, 16 ; 01
+ vpshrdd m3, m2, m4, 16 ; 23
+ vpshrdd m5, m4, m6, 16 ; 45
+.hv_w16_loop:
+ movu ym24, [srcq+ssq*1+ 0]
+ movu ym25, [srcq+ssq*1+16]
+ lea srcq, [srcq+ssq*2]
+ vinserti32x8 m24, [srcq+ssq*0+ 0], 1
+ vinserti32x8 m25, [srcq+ssq*0+16], 1
+ mova m7, m10
+ mova m8, m10
+ pshufb m0, m24, m20
+ vpdpwssd m7, m12, m0 ; h0
+ pshufb m0, m25, m20
+ vpdpwssd m8, m14, m0 ; i2
+ pmaddwd m22, m16, m1 ; A0
+ mova m1, m3
+ pmaddwd m23, m16, m2 ; B0
+ mova m2, m4
+ pshufb m0, m24, m21
+ vpdpwssd m7, m13, m0 ; h1
+ pshufb m0, m25, m21
+ vpdpwssd m8, m15, m0 ; i3
+ vpdpwssd m22, m17, m3 ; A1
+ mova m3, m5
+ vpdpwssd m23, m17, m4 ; B1
+ mova m4, m6
+ shufpd m24, m25, 0x55
+ pshufb m25, m24, m20
+ vpdpwssd m7, m14, m25 ; h2
+ vpdpwssd m8, m12, m25 ; i0
+ vpdpwssd m22, m18, m5 ; A2
+ vpdpwssd m23, m18, m6 ; B2
+ pshufb m24, m21
+ vpdpwssd m7, m15, m24 ; h3
+ vpdpwssd m8, m13, m24 ; i1
+ vpermt2b m7, m9, m8 ; 78
+ vpshrdd m5, m6, m7, 16 ; 67
+ vpdpwssd m22, m19, m5 ; A3
+ vpdpwssd m23, m19, m7 ; B3
+ mova m6, m7
+ psrad m22, 10
+ psrad m23, 10
+ vshufi32x4 m0, m22, m23, q3232
+ vinserti32x8 m22, ym23, 1
+ packusdw m22, m0
+ pminsw m22, m11
+ mova [dstq+dsq*0], ym22
+ vextracti32x8 [dstq+dsq*1], m22, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 32
+ vbroadcasti32x4 m20, [spel_h_shufA]
+ vbroadcasti32x4 m21, [spel_h_shufB]
+ mova m22, [spel_shuf32]
+ lea wd, [hq+wq*8-256]
+ mov r7, srcq
+ mov r8, dstq
+.hv_w32_loop0:
+ movu m6, [srcq+ssq*0+ 0]
+ movu m7, [srcq+ssq*0+ 8]
+ movu m8, [srcq+ssq*0+16]
+ mova m0, m10
+ mova m23, m10
+ pshufb m9, m6, m20
+ vpdpwssd m0, m12, m9 ; a0l
+ pshufb m9, m7, m20
+ vpdpwssd m23, m12, m9 ; a0h
+ vpdpwssd m0, m14, m9 ; a2l
+ pshufb m7, m21
+ vpdpwssd m23, m13, m7 ; a1h
+ vpdpwssd m0, m15, m7 ; a3l
+ pshufb m7, m8, m20
+ vpdpwssd m23, m14, m7 ; a2h
+ pshufb m6, m21
+ vpdpwssd m0, m13, m6 ; a1l
+ pshufb m8, m21
+ vpdpwssd m23, m15, m8 ; a3h
+%macro PUT_8TAP_HV_W32 5 ; dst_lo, dst_hi, stride_name, stride[1-2]
+ movu m6, [srcq+%3*%4+ 0]
+ movu m7, [srcq+%3*%4+ 8]
+ movu m8, [srcq+%3*%4+16]
+%if %4 == 2
+ add srcq, r6
+%endif
+ movu m29, [srcq+%3*%5+ 0]
+ movu m30, [srcq+%3*%5+ 8]
+ movu m31, [srcq+%3*%5+16]
+%if %5 == 2
+ add srcq, r6
+%endif
+ mova m%1, m10
+ mova m9, m10
+ pshufb m%2, m6, m20
+ vpdpwssd m%1, m12, m%2 ; x0l
+ pshufb m%2, m29, m20
+ vpdpwssd m9, m12, m%2 ; y0l
+ pshufb m6, m21
+ vpdpwssd m%1, m13, m6 ; x1l
+ pshufb m29, m21
+ vpdpwssd m9, m13, m29 ; y1l
+ pshufb m6, m7, m20
+ mova m%2, m10
+ vpdpwssd m%2, m12, m6 ; x0h
+ pshufb m29, m30, m20
+ vpdpwssd m%1, m14, m6 ; x2l
+ mova m6, m10
+ vpdpwssd m6, m12, m29 ; y0h
+ pshufb m7, m21
+ vpdpwssd m9, m14, m29 ; y2l
+ pshufb m30, m21
+ vpdpwssd m%2, m13, m7 ; x1h
+ vpdpwssd m%1, m15, m7 ; x3l
+ pshufb m7, m8, m20
+ vpdpwssd m6, m13, m30 ; y1h
+ vpdpwssd m9, m15, m30 ; y3l
+ pshufb m30, m31, m20
+ vpdpwssd m%2, m14, m7 ; x2h
+ pshufb m8, m21
+ vpdpwssd m6, m14, m30 ; y2h
+ pshufb m31, m21
+ vpdpwssd m%2, m15, m8 ; x3h
+ vpdpwssd m6, m15, m31 ; y3h
+%if %1 == 1
+ vpermt2b m0, m22, m%1 ; 01l
+ vpermt2b m23, m22, m%2 ; 01h
+%endif
+ vpermt2b m%1, m22, m9 ; xyl
+ vpermt2b m%2, m22, m6 ; xyh
+%endmacro
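+; PUT_8TAP_HV_W32 runs the horizontal 8-tap over two more source rows of a
+; 32-pixel strip: m%1/m9 accumulate the low 16 pixels of each row ("x"/"y"),
+; m%2/m6 the high halves, and the trailing vpermt2b folds each pair into the
+; xyl/xyh word-interleaved layout the vertical stage consumes; the first
+; invocation additionally folds the already-computed row a into m0/m23.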
+ PUT_8TAP_HV_W32 1, 24, ssq, 1, 2 ; 12
+ PUT_8TAP_HV_W32 3, 26, ssq, 0, 1 ; 34
+ PUT_8TAP_HV_W32 5, 28, ssq, 2, 0 ; 56
+ vpshrdd m2, m1, m3, 16 ; 23l
+ vpshrdd m25, m24, m26, 16 ; 23h
+ vpshrdd m4, m3, m5, 16 ; 45l
+ vpshrdd m27, m26, m28, 16 ; 45h
+.hv_w32_loop:
+ movu m7, [srcq+ssq*1+ 0]
+ movu m9, [srcq+ssq*2+ 0]
+ movu m6, [srcq+ssq*1+ 8]
+ movu m8, [srcq+ssq*2+ 8]
+ mova m29, m10
+ mova m31, m10
+ pshufb m30, m7, m20
+ vpdpwssd m29, m12, m30 ; h0l
+ pshufb m30, m9, m20
+ vpdpwssd m31, m12, m30 ; i0l
+ pshufb m7, m21
+ vpdpwssd m29, m13, m7 ; h1l
+ pshufb m9, m21
+ vpdpwssd m31, m13, m9 ; i1l
+ pshufb m7, m6, m20
+ vpdpwssd m29, m14, m7 ; h2l
+ pshufb m9, m8, m20
+ vpdpwssd m31, m14, m9 ; i2l
+ pshufb m6, m21
+ vpdpwssd m29, m15, m6 ; h3l
+ pshufb m8, m21
+ vpdpwssd m31, m15, m8 ; i3l
+ mova m30, m10
+ vpdpwssd m30, m12, m7 ; h0h
+ movu m7, [srcq+ssq*1+16]
+ lea srcq, [srcq+ssq*2]
+ vpermt2b m29, m22, m31 ; 78l
+ mova m31, m10
+ vpdpwssd m31, m12, m9 ; i0h
+ movu m9, [srcq+ssq*0+16]
+ vpdpwssd m30, m13, m6 ; h1h
+ pshufb m6, m7, m20
+ vpdpwssd m31, m13, m8 ; i1h
+ pshufb m8, m9, m20
+ vpdpwssd m30, m14, m6 ; h2h
+ pmaddwd m6, m16, m0 ; A0l
+ pshufb m7, m21
+ vpdpwssd m31, m14, m8 ; i2h
+ pmaddwd m8, m16, m23 ; A0h
+ pshufb m9, m21
+ vpdpwssd m30, m15, m7 ; h3h
+ pmaddwd m7, m16, m1 ; B0l
+ vpdpwssd m31, m15, m9 ; i3h
+ pmaddwd m9, m16, m24 ; B0h
+ mova m0, m2
+ vpdpwssd m6, m17, m2 ; A1l
+ mova m23, m25
+ vpdpwssd m8, m17, m25 ; A1h
+ mova m1, m3
+ vpdpwssd m7, m17, m3 ; B1l
+ mova m24, m26
+ vpdpwssd m9, m17, m26 ; B1h
+ vpermt2b m30, m22, m31 ; 78h
+ vpdpwssd m6, m18, m4 ; A2l
+ mova m2, m4
+ vpdpwssd m8, m18, m27 ; A2h
+ mova m25, m27
+ vpdpwssd m7, m18, m5 ; B2l
+ mova m3, m5
+ vpdpwssd m9, m18, m28 ; B2h
+ mova m26, m28
+ vpshrdd m4, m5, m29, 16 ; 67l
+ vpdpwssd m6, m19, m4 ; A3l
+ vpshrdd m27, m28, m30, 16 ; 67h
+ vpdpwssd m8, m19, m27 ; A3h
+ mova m5, m29
+ vpdpwssd m7, m19, m29 ; B3l
+ mova m28, m30
+ vpdpwssd m9, m19, m30 ; B3h
+ REPX {psrad x, 10}, m6, m8, m7, m9
+ packusdw m6, m8
+ packusdw m7, m9
+ pminsw m6, m11
+ pminsw m7, m11
+ mova [dstq+dsq*0], m6
+ mova [dstq+dsq*1], m7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w32_loop
+ add r7, 64
+ add r8, 64
+ movzx hd, wb
+ mov srcq, r7
+ mov dstq, r8
+ sub wd, 1<<8
+ jg .hv_w32_loop0
+ RET
+
+%if WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+MC_8TAP_FN prep, sharp, SHARP, SHARP
+MC_8TAP_FN prep, sharp_smooth, SHARP, SMOOTH
+MC_8TAP_FN prep, smooth_sharp, SMOOTH, SHARP
+MC_8TAP_FN prep, smooth, SMOOTH, SMOOTH
+MC_8TAP_FN prep, sharp_regular, SHARP, REGULAR
+MC_8TAP_FN prep, regular_sharp, REGULAR, SHARP
+MC_8TAP_FN prep, smooth_regular, SMOOTH, REGULAR
+MC_8TAP_FN prep, regular_smooth, REGULAR, SMOOTH
+MC_8TAP_FN prep, regular, REGULAR, REGULAR
+
+cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3
+%define base r7-prep_avx512icl
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r7, [prep_avx512icl]
+ mov wd, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ mov r5d, r7m ; bitdepth_max
+ vpbroadcastd m5, [pw_8192]
+ movzx wd, word [r7+wq*2+table_offset(prep,)]
+ shr r5d, 11
+ vpbroadcastd m4, [r7-prep_avx512icl+prep_mul+r5*4]
+ add wq, r7
+ lea r6, [strideq*3]
+%if WIN64
+ pop r7
+%endif
+ jmp wq
+.h_w4:
+ movzx mxd, mxb
+ sub srcq, 2
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ mov r5d, r7m
+ vbroadcasti32x4 m4, [spel_h_shufA]
+ vbroadcasti32x4 m5, [spel_h_shufB]
+ shr r5d, 11
+ mova ym9, [prep_endA]
+ psllw xmm0, [base+prep_hv_shift+r5*8]
+ mova [tmpq], xmm0
+ vpbroadcastd m6, [tmpq+4]
+ vpbroadcastd m7, [tmpq+8]
+.h_w4_loop:
+ movu xm2, [srcq+strideq*0]
+ vinserti32x4 ym2, [srcq+strideq*1], 1
+ vinserti32x4 m2, [srcq+strideq*2], 2
+ vinserti32x4 m2, [srcq+r6 ], 3
+ lea srcq, [srcq+strideq*4]
+ mova m0, m10
+ pshufb m1, m2, m4
+ vpdpwssd m0, m6, m1
+ pshufb m2, m5
+ vpdpwssd m0, m7, m2
+ vpermb m0, m9, m0
+ mova [tmpq], ym0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m10, [prep_8tap_rnd]
+ lea r6, [strideq*3]
+ cmp wd, 4
+ je .h_w4
+ shr mxd, 16
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ mov r5d, r7m
+ sub srcq, 6
+ shr r5d, 11
+ psllw xmm0, [base+prep_hv_shift+r5*8]
+ mova [tmpq], xmm0
+ vpbroadcastd m12, xmm0
+ vpbroadcastd m13, [tmpq+ 4]
+ vpbroadcastd m14, [tmpq+ 8]
+ vpbroadcastd m15, [tmpq+12]
+ cmp wd, 16
+ je .h_w16
+ jg .h_w32
+.h_w8:
+ mova m6, [spel_h_shufA]
+ movu m7, [spel_h_shufB]
+ movu m8, [spel_h_shufC]
+ mova m9, [spel_h_shufD]
+ mova m11, [prep_endB]
+.h_w8_loop:
+ movu ym4, [srcq+strideq*0]
+ vinserti32x8 m4, [srcq+strideq*1], 1
+ movu ym5, [srcq+strideq*2]
+ vinserti32x8 m5, [srcq+r6 ], 1
+ lea srcq, [srcq+strideq*4]
+ mova m0, m10
+ mova m1, m10
+ vpermb m2, m6, m4
+ vpermb m3, m6, m5
+ vpdpwssd m0, m12, m2
+ vpdpwssd m1, m12, m3
+ vpermb m2, m7, m4
+ vpermb m3, m7, m5
+ vpdpwssd m0, m13, m2
+ vpdpwssd m1, m13, m3
+ vpermb m2, m8, m4
+ vpermb m3, m8, m5
+ vpdpwssd m0, m14, m2
+ vpdpwssd m1, m14, m3
+ vpermb m2, m9, m4
+ vpermb m3, m9, m5
+ vpdpwssd m0, m15, m2
+ vpdpwssd m1, m15, m3
+ vpermt2b m0, m11, m1
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 4
+ jg .h_w8_loop
+ RET
+.h_w16:
+ vbroadcasti32x4 m6, [spel_h_shufA]
+ vbroadcasti32x4 m7, [spel_h_shufB]
+ mova m11, [prep_endC]
+.h_w16_loop:
+ movu ym2, [srcq+strideq*0+ 0]
+ vinserti32x8 m2, [srcq+strideq*1+ 0], 1
+ movu ym3, [srcq+strideq*0+16]
+ vinserti32x8 m3, [srcq+strideq*1+16], 1
+ lea srcq, [srcq+strideq*2]
+ mova m0, m10
+ mova m1, m10
+ pshufb m4, m2, m6
+ vpdpwssd m0, m12, m4 ; a0
+ pshufb m4, m3, m6
+ vpdpwssd m1, m14, m4 ; b2
+ pshufb m4, m2, m7
+ vpdpwssd m0, m13, m4 ; a1
+ pshufb m4, m3, m7
+ vpdpwssd m1, m15, m4 ; b3
+ shufpd m2, m3, 0x55
+ pshufb m4, m2, m6
+ vpdpwssd m0, m14, m4 ; a2
+ vpdpwssd m1, m12, m4 ; b0
+ pshufb m2, m7
+ vpdpwssd m0, m15, m2 ; a3
+ vpdpwssd m1, m13, m2 ; b1
+ vpermt2b m0, m11, m1
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 2
+ jg .h_w16_loop
+ RET
+.h_w32:
+ vbroadcasti32x4 m6, [spel_h_shufA]
+ lea srcq, [srcq+wq*2]
+ vbroadcasti32x4 m7, [spel_h_shufB]
+ neg wq
+ mova m11, [prep_endC]
+.h_w32_loop0:
+ mov r6, wq
+.h_w32_loop:
+ movu m2, [srcq+r6*2+ 0]
+ movu m3, [srcq+r6*2+ 8]
+ mova m0, m10
+ mova m1, m10
+ pshufb m4, m2, m6
+ vpdpwssd m0, m12, m4 ; a0
+ pshufb m4, m3, m6
+ vpdpwssd m1, m12, m4 ; b0
+ vpdpwssd m0, m14, m4 ; a2
+ movu m4, [srcq+r6*2+16]
+ pshufb m3, m7
+ vpdpwssd m1, m13, m3 ; b1
+ vpdpwssd m0, m15, m3 ; a3
+ pshufb m3, m4, m6
+ vpdpwssd m1, m14, m3 ; b2
+ pshufb m2, m7
+ vpdpwssd m0, m13, m2 ; a1
+ pshufb m4, m7
+ vpdpwssd m1, m15, m4 ; b3
+ vpermt2b m0, m11, m1
+ mova [tmpq], m0
+ add tmpq, 64
+ add r6, 32
+ jl .h_w32_loop
+ add srcq, strideq
+ dec hd
+ jg .h_w32_loop0
+ RET
+.v:
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ mov r5d, r7m
+ vpbroadcastd m10, [prep_8tap_rnd]
+ pmovsxbw xmm0, [base+subpel_filters+myq*8]
+ tzcnt r6d, wd
+ shr r5d, 11
+ movzx r6d, word [r7+r6*2+table_offset(prep, _8tap_v)]
+ psllw xmm0, [base+prep_hv_shift+r5*8]
+ add r7, r6
+ lea r6, [strideq*3]
+ sub srcq, r6
+ mova [tmpq], xmm0
+ vpbroadcastd m12, xmm0
+ vpbroadcastd m13, [tmpq+ 4]
+ vpbroadcastd m14, [tmpq+ 8]
+ vpbroadcastd m15, [tmpq+12]
+ jmp r7
+.v_w4:
+ movq xmm1, [srcq+strideq*0]
+ vpbroadcastq ymm0, [srcq+strideq*1]
+ vpbroadcastq ymm2, [srcq+strideq*2]
+ add srcq, r6
+ vpbroadcastq ymm4, [srcq+strideq*0]
+ vpbroadcastq ymm3, [srcq+strideq*1]
+ vpbroadcastq ymm5, [srcq+strideq*2]
+ mova xm11, [prep_endA]
+ add srcq, r6
+ vpblendd ymm1, ymm0, 0x30
+ vpblendd ymm0, ymm2, 0x30
+ punpcklwd ymm1, ymm0 ; 01 12
+ vpbroadcastq ymm0, [srcq+strideq*0]
+ vpblendd ymm2, ymm4, 0x30
+ vpblendd ymm4, ymm3, 0x30
+ punpcklwd ymm2, ymm4 ; 23 34
+ vpblendd ymm3, ymm5, 0x30
+ vpblendd ymm5, ymm0, 0x30
+ punpcklwd ymm3, ymm5 ; 45 56
+.v_w4_loop:
+ vpbroadcastq ymm5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ mova ymm4, ym10
+ vpdpwssd ymm4, ym12, ymm1 ; a0 b0
+ mova ymm1, ymm2
+ vpdpwssd ymm4, ym13, ymm2 ; a1 b1
+ mova ymm2, ymm3
+ vpdpwssd ymm4, ym14, ymm3 ; a2 b2
+ vpblendd ymm3, ymm0, ymm5, 0x30
+ vpbroadcastq ymm0, [srcq+strideq*0]
+ vpblendd ymm5, ymm0, 0x30
+ punpcklwd ymm3, ymm5 ; 67 78
+ vpdpwssd ymm4, ym15, ymm3 ; a3 b3
+ vpermb ymm4, ym11, ymm4
+ mova [tmpq], xmm4
+ add tmpq, 16
+ sub hd, 2
+ jg .v_w4_loop
+ vzeroupper
+ RET
+.v_w8:
+ vbroadcasti32x4 m2, [srcq+strideq*2]
+ vinserti32x4 m1, m2, [srcq+strideq*0], 0
+ vinserti32x4 m1, [srcq+strideq*1], 1 ; 0 1 2
+ add srcq, r6
+ vinserti32x4 ym2, [srcq+strideq*0], 1
+ vinserti32x4 m2, [srcq+strideq*1], 2 ; 2 3 4
+ mova m6, [spel_v_shuf8]
+ movu xm0, [srcq+strideq*1]
+ vinserti32x4 ym0, [srcq+strideq*2], 1
+ add srcq, r6
+ vinserti32x4 m0, [srcq+strideq*0], 2 ; 4 5 6
+ mova ym11, [prep_endB]
+ vpermb m1, m6, m1 ; 01 12
+ vpermb m2, m6, m2 ; 23 34
+ vpermb m3, m6, m0 ; 45 56
+.v_w8_loop:
+ vinserti32x4 m0, [srcq+strideq*1], 3
+ lea srcq, [srcq+strideq*2]
+ movu xm5, [srcq+strideq*0]
+ mova m4, m10
+ vpdpwssd m4, m12, m1 ; a0 b0
+ mova m1, m2
+ vshufi32x4 m0, m5, q1032 ; 6 7 8
+ vpdpwssd m4, m13, m2 ; a1 b1
+ mova m2, m3
+ vpdpwssd m4, m14, m3 ; a2 b2
+ vpermb m3, m6, m0 ; 67 78
+ vpdpwssd m4, m15, m3 ; a3 b3
+ vpermb m4, m11, m4
+ mova [tmpq], ym4
+ add tmpq, 32
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+ vbroadcasti32x8 m1, [srcq+strideq*1]
+ vinserti32x8 m0, m1, [srcq+strideq*0], 0
+ vinserti32x8 m1, [srcq+strideq*2], 1
+ mova m8, [spel_v_shuf16]
+ add srcq, r6
+ movu ym3, [srcq+strideq*0]
+ vinserti32x8 m3, [srcq+strideq*1], 1
+ movu ym5, [srcq+strideq*2]
+ add srcq, r6
+ vinserti32x8 m5, [srcq+strideq*0], 1
+ mova m11, [prep_endA]
+ vpermb m0, m8, m0 ; 01
+ vpermb m1, m8, m1 ; 12
+ vpermb m3, m8, m3 ; 34
+ vpermb m5, m8, m5 ; 56
+ vpshrdd m2, m1, m3, 16 ; 23
+ vpshrdd m4, m3, m5, 16 ; 45
+.v_w16_loop:
+ mova m6, m10
+ mova m7, m10
+ vpdpwssd m6, m12, m0 ; a0
+ mova m0, m2
+ vpdpwssd m7, m12, m1 ; b0
+ mova m1, m3
+ vpdpwssd m6, m13, m2 ; a1
+ mova m2, m4
+ vpdpwssd m7, m13, m3 ; b1
+ mova m3, m5
+ vpdpwssd m6, m14, m4 ; a2
+ mova m4, m5
+ vpdpwssd m7, m14, m5 ; b2
+ movu ym5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vinserti32x8 m5, [srcq+strideq*0], 1
+ vpermb m5, m8, m5 ; 78
+ vpshrdd m4, m5, 16 ; 67
+ vpdpwssd m6, m15, m4 ; a3
+ vpdpwssd m7, m15, m5 ; b3
+ vpermt2b m6, m11, m7
+ mova [tmpq], m6
+ add tmpq, 64
+ sub hd, 2
+ jg .v_w16_loop
+ RET
+.v_w32:
+.v_w64:
+.v_w128:
+%if WIN64
+ PUSH r8
+ movaps [rsp+stack_offset+8], xmm6
+%endif
+ lea r5, [hq+wq*8-256]
+ mov r7, srcq
+ mov r8, tmpq
+.v_w32_loop0:
+ movu m16, [srcq+strideq*0]
+ movu m17, [srcq+strideq*1]
+ movu m18, [srcq+strideq*2]
+ add srcq, r6
+ movu m19, [srcq+strideq*0]
+ movu m20, [srcq+strideq*1]
+ movu m21, [srcq+strideq*2]
+ add srcq, r6
+ movu m22, [srcq+strideq*0]
+ mova m11, [prep_endC]
+ punpcklwd m0, m16, m17 ; 01l
+ punpckhwd m16, m17 ; 01h
+ punpcklwd m1, m17, m18 ; 12l
+ punpckhwd m17, m18 ; 12h
+ punpcklwd m2, m18, m19 ; 23l
+ punpckhwd m18, m19 ; 23h
+ punpcklwd m3, m19, m20 ; 34l
+ punpckhwd m19, m20 ; 34h
+ punpcklwd m4, m20, m21 ; 45l
+ punpckhwd m20, m21 ; 45h
+ punpcklwd m5, m21, m22 ; 56l
+ punpckhwd m21, m22 ; 56h
+.v_w32_loop:
+ mova m6, m10
+ vpdpwssd m6, m12, m0 ; a0l
+ mova m8, m10
+ vpdpwssd m8, m12, m16 ; a0h
+ mova m7, m10
+ vpdpwssd m7, m12, m1 ; b0l
+ mova m9, m10
+ vpdpwssd m9, m12, m17 ; b0h
+ mova m0, m2
+ vpdpwssd m6, m13, m2 ; a1l
+ mova m16, m18
+ vpdpwssd m8, m13, m18 ; a1h
+ mova m1, m3
+ vpdpwssd m7, m13, m3 ; b1l
+ mova m17, m19
+ vpdpwssd m9, m13, m19 ; b1h
+ mova m2, m4
+ vpdpwssd m6, m14, m4 ; a2l
+ mova m18, m20
+ vpdpwssd m8, m14, m20 ; a2h
+ mova m3, m5
+ vpdpwssd m7, m14, m5 ; b2l
+ mova m19, m21
+ vpdpwssd m9, m14, m21 ; b2h
+ movu m21, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ punpcklwd m4, m22, m21 ; 67l
+ punpckhwd m20, m22, m21 ; 67h
+ movu m22, [srcq+strideq*0]
+ vpdpwssd m6, m15, m4 ; a3l
+ vpdpwssd m8, m15, m20 ; a3h
+ punpcklwd m5, m21, m22 ; 78l
+ punpckhwd m21, m22 ; 78h
+ vpdpwssd m7, m15, m5 ; b3l
+ vpdpwssd m9, m15, m21 ; b3h
+ vpermt2b m6, m11, m8
+ vpermt2b m7, m11, m9
+ mova [tmpq+wq*0], m6
+ mova [tmpq+wq*2], m7
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .v_w32_loop
+ add r7, 64
+ add r8, 64
+ movzx hd, r5b
+ mov srcq, r7
+ mov tmpq, r8
+ sub r5d, 1<<8
+ jg .v_w32_loop0
+%if WIN64
+ movaps xmm6, [rsp+stack_offset+8]
+ POP r8
+%endif
+ vzeroupper
+ RET
+.hv:
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ mov r5d, r7m
+ pmovsxbw xmm1, [base+subpel_filters+myq*8]
+ lea r6, [strideq*3]
+ sub srcq, 2
+ shr r5d, 11
+ sub srcq, r6
+ psllw xmm0, [base+prep_hv_shift+r5*8]
+ psllw xmm1, 2
+ vpbroadcastd m10, [prep_8tap_rnd]
+ vpbroadcastd ym11, [pd_128]
+ mova xm21, [prep_endA]
+ mova [tmpq+ 0], xmm0
+ mova [tmpq+16], xmm1
+ vpbroadcastd m8, [tmpq+ 4]
+ vpbroadcastd m9, [tmpq+ 8]
+ vpbroadcastd ym12, xmm1
+ vpbroadcastd ym13, [tmpq+20]
+ vpbroadcastd ym14, [tmpq+24]
+ vpbroadcastd ym15, [tmpq+28]
+ movu xm4, [srcq+strideq*0]
+ vinserti32x4 ym4, [srcq+strideq*1], 1
+ vinserti32x4 m4, [srcq+strideq*2], 2
+ add srcq, r6
+ vinserti32x4 m4, [srcq+strideq*0], 3 ; 0 1 2 3
+ movu xm0, [srcq+strideq*1]
+ vinserti32x4 ym0, [srcq+strideq*2], 1
+ add srcq, r6
+ vinserti32x4 m0, [srcq+strideq*0], 2 ; 4 5 6
+ vbroadcasti32x4 m19, [spel_h_shufA]
+ vbroadcasti32x4 m20, [spel_h_shufB]
+ mova ym6, [spel_shuf4a]
+ mova ym7, [spel_shuf4b]
+ mova m2, m10
+ mova m3, m10
+ pshufb m1, m4, m19
+ vpdpwssd m2, m8, m1
+ pshufb m1, m0, m19
+ vpdpwssd m3, m8, m1
+ pshufb m4, m20
+ vpdpwssd m2, m9, m4
+ pshufb m0, m20
+ vpdpwssd m3, m9, m0
+ vpermb m1, m6, m2 ; 01 12
+ vshufi32x4 m2, m3, q1032
+ vpermb m3, m6, m3 ; 45 56
+ vpermb m2, m6, m2 ; 23 34
+.hv_w4_loop:
+ movu xm18, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vinserti128 ym18, [srcq+strideq*0], 1
+ mova ym16, ym11
+ mova ym4, ym10
+ pshufb ym17, ym18, ym19
+ vpdpwssd ym16, ym12, ym1 ; a0 b0
+ vpdpwssd ym4, ym8, ym17
+ pshufb ym18, ym20
+ mova ym1, ym2
+ vpdpwssd ym16, ym13, ym2 ; a1 b1
+ vpdpwssd ym4, ym9, ym18 ; 7 8
+ mova ym2, ym3
+ vpdpwssd ym16, ym14, ym3 ; a2 b2
+ vpermt2b ym3, ym7, ym4 ; 67 78
+ vpdpwssd ym16, ym15, ym3 ; a3 b3
+ vpermb ym16, ym21, ym16
+ mova [tmpq], xm16
+ add tmpq, 16
+ sub hd, 2
+ jg .hv_w4_loop
+ vzeroupper
+ RET
+.hv_w8:
+ shr mxd, 16
+ pmovsxbw xmm0, [base+subpel_filters+mxq*8]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ mov r5d, r7m
+ pmovsxbw xmm1, [base+subpel_filters+myq*8]
+ lea r6, [strideq*3]
+ sub srcq, 6
+ shr r5d, 11
+ sub srcq, r6
+ vpbroadcastd m10, [prep_8tap_rnd]
+ vpbroadcastd m11, [pd_128]
+ psllw xmm0, [base+prep_hv_shift+r5*8]
+ psllw xmm1, 2
+ mova [tmpq+ 0], xmm0
+ mova [tmpq+16], xmm1
+ vpbroadcastd m12, xmm0
+ vpbroadcastd m13, [tmpq+ 4]
+ vpbroadcastd m14, [tmpq+ 8]
+ vpbroadcastd m15, [tmpq+12]
+ vpbroadcastd m16, xmm1
+ vpbroadcastd m17, [tmpq+20]
+ vpbroadcastd m18, [tmpq+24]
+ vpbroadcastd m19, [tmpq+28]
+ cmp wd, 16
+ je .hv_w16
+ jg .hv_w32
+ WIN64_SPILL_XMM 23
+ mova m5, [spel_h_shufA]
+ movu ym0, [srcq+strideq*0]
+ vinserti32x8 m0, [srcq+strideq*1], 1 ; 0 1
+ movu ym9, [srcq+strideq*2]
+ add srcq, r6
+ vinserti32x8 m9, [srcq+strideq*0], 1 ; 2 3
+ movu ym20, [srcq+strideq*1]
+ vinserti32x8 m20, [srcq+strideq*2], 1 ; 4 5
+ add srcq, r6
+ movu ym21, [srcq+strideq*0] ; 6
+ movu m6, [spel_h_shufB]
+ movu m7, [spel_h_shufC]
+ mova ym22, [prep_endB]
+ vpermb m8, m5, m0
+ mova m1, m10
+ vpdpwssd m1, m12, m8 ; a0 b0
+ vpermb m8, m5, m9
+ mova m2, m10
+ vpdpwssd m2, m12, m8 ; c0 d0
+ vpermb m8, m5, m20
+ mova m3, m10
+ vpdpwssd m3, m12, m8 ; e0 f0
+ vpermb m8, m5, m21
+ mova m4, m10
+ vpdpwssd m4, m12, m8 ; g0
+ vpermb m8, m6, m0
+ vpdpwssd m1, m13, m8 ; a1 b1
+ vpermb m8, m6, m9
+ vpdpwssd m2, m13, m8 ; c1 d1
+ vpermb m8, m6, m20
+ vpdpwssd m3, m13, m8 ; e1 f1
+ vpermb m8, m6, m21
+ vpdpwssd m4, m13, m8 ; g1
+ vpermb m8, m7, m0
+ vpdpwssd m1, m14, m8 ; a2 b2
+ vpermb m8, m7, m9
+ vpdpwssd m2, m14, m8 ; c2 d2
+ vpermb m8, m7, m20
+ vpdpwssd m3, m14, m8 ; e2 f2
+ vpermb m8, m7, m21
+ vpdpwssd m4, m14, m8 ; g2
+ mova m8, [spel_h_shufD]
+ vpermb m0, m8, m0
+ vpdpwssd m1, m15, m0 ; a3 b3
+ mova m0, [spel_shuf8a]
+ vpermb m9, m8, m9
+ vpdpwssd m2, m15, m9 ; c3 d3
+ mova m9, [spel_shuf8b]
+ vpermb m20, m8, m20
+ vpdpwssd m3, m15, m20 ; e3 f3
+ vpermb m21, m8, m21
+ vpdpwssd m4, m15, m21 ; g3
+ vpermt2b m1, m0, m2 ; 01 12
+ vpermt2b m2, m0, m3 ; 23 34
+ vpermt2b m3, m0, m4 ; 45 56
+.hv_w8_loop:
+ movu ym0, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vinserti32x8 m0, [srcq+strideq*0], 1
+ mova m4, m10
+ mova m20, m11
+ vpermb m21, m5, m0
+ vpdpwssd m4, m12, m21 ; h0 i0
+ vpermb m21, m6, m0
+ vpdpwssd m20, m16, m1 ; A0 B0
+ vpdpwssd m4, m13, m21 ; h1 i1
+ vpermb m21, m7, m0
+ mova m1, m2
+ vpdpwssd m20, m17, m2 ; A1 B1
+ vpdpwssd m4, m14, m21 ; h2 i2
+ vpermb m21, m8, m0
+ mova m2, m3
+ vpdpwssd m20, m18, m3 ; A2 B2
+ vpdpwssd m4, m15, m21 ; h3 i3
+ vpermt2b m3, m9, m4 ; 67 78
+ vpdpwssd m20, m19, m3 ; A3 B3
+ vpermb m20, m22, m20
+ mova [tmpq], ym20
+ add tmpq, 32
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 27
+ vbroadcasti32x8 m5, [srcq+strideq*0+ 8]
+ vinserti32x8 m4, m5, [srcq+strideq*0+ 0], 0
+ vinserti32x8 m5, [srcq+strideq*0+16], 1 ; 0
+ movu ym6, [srcq+strideq*1+ 0]
+ movu ym7, [srcq+strideq*1+16]
+ vinserti32x8 m6, [srcq+strideq*2+ 0], 1
+ vinserti32x8 m7, [srcq+strideq*2+16], 1 ; 1 2
+ add srcq, r6
+ movu ym22, [srcq+strideq*0+ 0]
+ movu ym23, [srcq+strideq*0+16]
+ vinserti32x8 m22, [srcq+strideq*1+ 0], 1
+ vinserti32x8 m23, [srcq+strideq*1+16], 1 ; 3 4
+ movu ym24, [srcq+strideq*2+ 0]
+ movu ym25, [srcq+strideq*2+16]
+ add srcq, r6
+ vinserti32x8 m24, [srcq+strideq*0+ 0], 1
+ vinserti32x8 m25, [srcq+strideq*0+16], 1 ; 5 6
+ vbroadcasti32x4 m20, [spel_h_shufA]
+ vbroadcasti32x4 m21, [spel_h_shufB]
+ mova m9, [spel_shuf16]
+ mova m26, [prep_endB]
+ pshufb m0, m4, m20
+ mova m1, m10
+ vpdpwssd m1, m12, m0 ; a0
+ pshufb m0, m6, m20
+ mova m2, m10
+ vpdpwssd m2, m12, m0 ; b0
+ pshufb m0, m7, m20
+ mova m3, m10
+ vpdpwssd m3, m14, m0 ; c2
+ pshufb m0, m4, m21
+ vpdpwssd m1, m13, m0 ; a1
+ pshufb m0, m6, m21
+ vpdpwssd m2, m13, m0 ; b1
+ pshufb m0, m7, m21
+ vpdpwssd m3, m15, m0 ; c3
+ pshufb m0, m5, m20
+ vpdpwssd m1, m14, m0 ; a2
+ shufpd m6, m7, 0x55
+ pshufb m7, m6, m20
+ vpdpwssd m2, m14, m7 ; b2
+ vpdpwssd m3, m12, m7 ; c0
+ pshufb m5, m21
+ vpdpwssd m1, m15, m5 ; a3
+ pshufb m6, m21
+ vpdpwssd m2, m15, m6 ; b3
+ vpdpwssd m3, m13, m6 ; c1
+ pshufb m0, m22, m20
+ mova m4, m10
+ vpdpwssd m4, m12, m0 ; d0
+ pshufb m0, m23, m20
+ mova m5, m10
+ vpdpwssd m5, m14, m0 ; e2
+ pshufb m0, m24, m20
+ mova m6, m10
+ vpdpwssd m6, m12, m0 ; f0
+ pshufb m0, m25, m20
+ mova m7, m10
+ vpdpwssd m7, m14, m0 ; g2
+ pshufb m0, m22, m21
+ vpdpwssd m4, m13, m0 ; d1
+ pshufb m0, m23, m21
+ vpdpwssd m5, m15, m0 ; e3
+ pshufb m0, m24, m21
+ vpdpwssd m6, m13, m0 ; f1
+ pshufb m0, m25, m21
+ vpdpwssd m7, m15, m0 ; g3
+ shufpd m22, m23, 0x55
+ pshufb m23, m22, m20
+ vpdpwssd m4, m14, m23 ; d2
+ vpdpwssd m5, m12, m23 ; e0
+ shufpd m24, m25, 0x55
+ pshufb m25, m24, m20
+ vpdpwssd m6, m14, m25 ; f2
+ vpdpwssd m7, m12, m25 ; g0
+ pshufb m22, m21
+ vpdpwssd m4, m15, m22 ; d3
+ vpdpwssd m5, m13, m22 ; e1
+ pshufb m24, m21
+ vpdpwssd m6, m15, m24 ; f3
+ vpdpwssd m7, m13, m24 ; g1
+ pslldq m1, 1
+ vpermt2b m2, m9, m3 ; 12
+ vpermt2b m4, m9, m5 ; 34
+ vpermt2b m6, m9, m7 ; 56
+ vpshrdd m1, m2, 16 ; 01
+ vpshrdd m3, m2, m4, 16 ; 23
+ vpshrdd m5, m4, m6, 16 ; 45
+.hv_w16_loop:
+ movu ym24, [srcq+strideq*1+ 0]
+ movu ym25, [srcq+strideq*1+16]
+ lea srcq, [srcq+strideq*2]
+ vinserti32x8 m24, [srcq+strideq*0+ 0], 1
+ vinserti32x8 m25, [srcq+strideq*0+16], 1
+ mova m7, m10
+ mova m8, m10
+ pshufb m0, m24, m20
+ vpdpwssd m7, m12, m0 ; h0
+ mova m22, m11
+ pshufb m0, m25, m20
+ vpdpwssd m8, m14, m0 ; i2
+ mova m23, m11
+ vpdpwssd m22, m16, m1 ; A0
+ mova m1, m3
+ vpdpwssd m23, m16, m2 ; B0
+ mova m2, m4
+ pshufb m0, m24, m21
+ vpdpwssd m7, m13, m0 ; h1
+ pshufb m0, m25, m21
+ vpdpwssd m8, m15, m0 ; i3
+ vpdpwssd m22, m17, m3 ; A1
+ mova m3, m5
+ vpdpwssd m23, m17, m4 ; B1
+ mova m4, m6
+ shufpd m24, m25, 0x55
+ pshufb m25, m24, m20
+ vpdpwssd m7, m14, m25 ; h2
+ vpdpwssd m8, m12, m25 ; i0
+ vpdpwssd m22, m18, m5 ; A2
+ vpdpwssd m23, m18, m6 ; B2
+ pshufb m24, m21
+ vpdpwssd m7, m15, m24 ; h3
+ vpdpwssd m8, m13, m24 ; i1
+ vpermt2b m7, m9, m8 ; 78
+ vpshrdd m5, m6, m7, 16 ; 67
+ vpdpwssd m22, m19, m5 ; A3
+ vpdpwssd m23, m19, m7 ; B3
+ mova m6, m7
+ vpermt2b m22, m26, m23
+ mova [tmpq], m22
+ add tmpq, 64
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+ mova m30, m10 + vpdpwssd m30, m12, m7 ; h0h + movu m7, [srcq+strideq*1+16] + lea srcq, [srcq+strideq*2] + vpermt2b m29, m22, m31 ; 78l + mova m31, m10 + vpdpwssd m31, m12, m9 ; i0h + movu m9, [srcq+strideq*0+16] + vpdpwssd m30, m13, m6 ; h1h + pshufb m6, m7, m20 + vpdpwssd m31, m13, m8 ; i1h + pshufb m8, m9, m20 + vpdpwssd m30, m14, m6 ; h2h + mova m6, m11 + vpdpwssd m6, m16, m0 ; A0l + pshufb m7, m21 + vpdpwssd m31, m14, m8 ; i2h + mova m8, m11 + vpdpwssd m8, m16, m23 ; A0h + pshufb m9, m21 + vpdpwssd m30, m15, m7 ; h3h + mova m7, m11 + vpdpwssd m7, m16, m1 ; B0l + vpdpwssd m31, m15, m9 ; i3h + mova m9, m11 + vpdpwssd m9, m16, m24 ; B0h + mova m0, m2 + vpdpwssd m6, m17, m2 ; A1l + mova m23, m25 + vpdpwssd m8, m17, m25 ; A1h + mova m1, m3 + vpdpwssd m7, m17, m3 ; B1l + mova m24, m26 + vpdpwssd m9, m17, m26 ; B1h + vpermt2b m30, m22, m31 ; 78h + mova m31, [prep_endC] + vpdpwssd m6, m18, m4 ; A2l + mova m2, m4 + vpdpwssd m8, m18, m27 ; A2h + mova m25, m27 + vpdpwssd m7, m18, m5 ; B2l + mova m3, m5 + vpdpwssd m9, m18, m28 ; B2h + mova m26, m28 + vpshrdd m4, m5, m29, 16 ; 67l + vpdpwssd m6, m19, m4 ; A3l + vpshrdd m27, m28, m30, 16 ; 67h + vpdpwssd m8, m19, m27 ; A3h + mova m5, m29 + vpdpwssd m7, m19, m29 ; B3l + mova m28, m30 + vpdpwssd m9, m19, m30 ; B3h + vpermt2b m6, m31, m8 + vpermt2b m7, m31, m9 + mova [tmpq+wq*0], m6 + mova [tmpq+wq*2], m7 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jg .hv_w32_loop + add r7, 64 + add r8, 64 + movzx hd, r5b + mov srcq, r7 + mov tmpq, r8 + sub r5d, 1<<8 + jg .hv_w32_loop0 + RET + +%if WIN64 +DECLARE_REG_TMP 5 +%else +DECLARE_REG_TMP 7 +%endif + +cglobal warp_affine_8x8t_16bpc, 4, 7, 22, tmp, ts +%define base r6-pd_0to7 + mov t0d, r7m + lea r6, [pd_0to7] + shr t0d, 11 + vpbroadcastd m8, [base+warp_8x8t_rnd_v] + vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4] + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main + psrad m14, m16, 15 + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 + psrad m16, 15 + packssdw m14, m16 + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 + psrad m15, m16, 15 + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 + add tsq, tsq + psrad m16, 15 + packssdw m15, m16 + jmp mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).end + +cglobal warp_affine_8x8_16bpc, 4, 7, 22, dst, ds, src, ss, abcd + mov t0d, r7m ; pixel_max + lea r6, [pd_0to7] + shr t0d, 11 + vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4] + vpbroadcastd m8, [base+warp_8x8_rnd_v+t0*4] + call .main + psrad m14, m16, 13 + call .main2 + psrad m16, 13 + packusdw m14, m16 + call .main2 + psrad m15, m16, 13 + call .main2 + vpbroadcastd m0, [base+bidir_shift+t0*4] + vpsrlvw m14, m0 + psrad m16, 13 + packusdw m15, m16 + vpsrlvw m15, m0 +.end: + mova m0, [base+warp8x8_end] + vpermb m16, m0, m14 + lea r2, [dsq*3] + mova [dstq+dsq*0], xm16 + vextracti128 [dstq+dsq*1], ym16, 1 + vextracti32x4 [dstq+dsq*2], m16, 2 + vextracti32x4 [dstq+r2 ], m16, 3 + vpermb m16, m0, m15 + lea dstq, [dstq+dsq*4] + mova [dstq+dsq*0], xm16 + vextracti128 [dstq+dsq*1], ym16, 1 + vextracti32x4 [dstq+dsq*2], m16, 2 + vextracti32x4 [dstq+r2 ], m16, 3 + RET +.main: + vpbroadcastd ym3, [base+pd_512] +%if WIN64 + mov abcdq, r5mp + vpaddd ym18, ym3, r6m {1to8} ; mx +%else + add r5d, 512 + vpbroadcastd ym18, r5d +%endif + vpaddd ym20, ym3, r7m {1to8} ; my + mova ym16, [base+pd_0to7] + vpbroadcastd ym19, [abcdq+4*0] ; alpha + vpbroadcastd ym21, [abcdq+4*1] ; gamma + lea r4, [ssq*3+6] + vpdpwssd ym18, ym19, ym16 ; tmx + vpdpwssd ym20, ym21, 
ym16 ; tmy + sub srcq, r4 + mova m10, [base+warp8x8_permA] + lea r4, [mc_warp_filter+64*8] + vbroadcasti32x4 m12, [base+warp8x8_permC] + kxnorb k1, k1, k1 + vbroadcasti32x4 m13, [base+warp8x8_permD] + movu ym5, [srcq+0] + vinserti32x8 m5, [srcq+8], 1 + psrad ym17, ym18, 10 + mova m11, [base+warp8x8_permB] + kmovb k2, k1 + vpgatherdq m3{k1}, [r4+ym17*8] ; filter_x0 + psrad ym19, 16 ; beta + psrad ym21, 16 ; delta + paddd ym18, ym19 + vpermb m4, m10, m5 + vpbroadcastq m9, [base+warp_shift_h+t0*8] + pshufd m3, m3, q3120 + paddd m7, m1, m1 + pshufb m2, m3, m12 + vpdpwssd m1, m4, m2 + vpermb m5, m11, m5 + vshufi32x4 m4, m5, q1021 + pshufb m3, m13 + vpdpwssd m1, m4, m3 + call .h + psllq m2, m1, 32 + paddd m1, m2 + vpmultishiftqb m1, m9, m1 + vpshrdq m1, m0, 48 ; 01 12 + call .h + vpshrdq m2, m1, m0, 48 ; 23 34 + call .h + vpshrdq m3, m2, m0, 48 ; 45 56 +.main2: + call .h + psrad ym6, ym20, 10 + kmovb k1, k2 + paddd ym17, ym20, ym21 ; my += delta + vpgatherdq m20{k2}, [r4+ym6*8] ; filter_y0 + psrad ym16, ym17, 10 + kmovb k2, k1 + vpgatherdq m6{k1}, [r4+ym16*8] ; filter_y1 + shufps m5, m20, m6, q2020 + mova m16, m8 + pshufb m4, m5, m12 + vpdpwssd m16, m1, m4 ; a0 b0 + pshufb m5, m13 + mova m1, m2 + vpdpwssd m16, m2, m5 ; a1 b1 + shufps m6, m20, m6, q3131 + paddd ym20, ym17, ym21 + pshufb m4, m6, m12 + mova m2, m3 + vpdpwssd m16, m3, m4 ; a2 b2 + vpshrdq m3, m0, 48 ; 67 78 + pshufb m6, m13 + vpdpwssd m16, m3, m6 ; a3 b3 + ret +ALIGN function_align +.h: + movu ym16, [srcq+ssq*1] + psrad ym6, ym18, 10 + lea srcq, [srcq+ssq*2] + vinserti32x8 m5, m16, [srcq+ssq*0], 1 + kmovb k1, k2 + paddd ym17, ym18, ym19 ; mx += beta + vpgatherdq m18{k2}, [r4+ym6*8] ; filter_x1 + psrad ym16, ym17, 10 + kmovb k2, k1 + vpgatherdq m6{k1}, [r4+ym16*8] ; filter_x2 + vpermb m4, m10, m5 + shufps m16, m18, m6, q2020 + shufps m6, m18, m6, q3131 + mova m0, m7 + pshufb m18, m16, m12 + vpdpwssd m0, m4, m18 ; a0 b0 + vpermb m5, m11, m5 + pshufb m18, m6, m13 + vpdpwssd m0, m5, m18 ; a3 b3 + paddd ym18, ym17, ym19 + vshufi32x4 m17, m4, m5, q1021 + pshufb m16, m13 + vpdpwssd m0, m17, m16 ; a1 b1 + vshufi32x4 m4, m5, q2132 + pshufb m6, m12 + vpdpwssd m0, m4, m6 ; a2 b2 + vpmultishiftqb m0, m9, m0 ; a a b b + ret + +%macro BIDIR_FN 0 + call .main + lea stride3q, [strideq*3] + jmp wq +.w4: + movq [dstq ], xm0 + movhps [dstq+strideq*1], xm0 + vextracti32x4 xm2, ym0, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + cmp hd, 8 + jl .w4_end + vextracti32x4 xm2, m0, 2 + lea dstq, [dstq+strideq*4] + movq [dstq ], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm0, m0, 3 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + je .w4_end + lea dstq, [dstq+strideq*4] + movq [dstq ], xm1 + movhps [dstq+strideq*1], xm1 + vextracti32x4 xm0, ym1, 1 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + vextracti32x4 xm0, m1, 2 + lea dstq, [dstq+strideq*4] + movq [dstq ], xm0 + movhps [dstq+strideq*1], xm0 + vextracti32x4 xm1, m1, 3 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 +.w4_end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*4] +.w8: + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+stride3q ], m0, 3 + sub hd, 8 + jl .w8_end + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm1 + vextracti32x4 [dstq+strideq*1], ym1, 1 + vextracti32x4 [dstq+strideq*2], m1, 2 + vextracti32x4 [dstq+stride3q ], m1, 3 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*4] +.w16: + mova 
[dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+stride3q ], m1, 1 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + call .main + lea dstq, [dstq+strideq*2] +.w32: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + call .main + mova [dstq+64*2], m0 + mova [dstq+64*3], m1 + dec hd + jg .w128_loop + RET +%endmacro + +%if WIN64 +DECLARE_REG_TMP 5 +%else +DECLARE_REG_TMP 7 +%endif + +cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h, stride3 +%define base r6-avg_avx512icl_table + lea r6, [avg_avx512icl_table] + tzcnt wd, wm + mov t0d, r6m ; pixel_max + movsxd wq, [r6+wq*4] + shr t0d, 11 + vpbroadcastd m2, [base+avg_round+t0*4] + vpbroadcastd m3, [base+avg_shift+t0*4] + movifnidn hd, hm + add wq, r6 + BIDIR_FN +ALIGN function_align +.main: + mova m0, [tmp1q+64*0] + paddsw m0, [tmp2q+64*0] + mova m1, [tmp1q+64*1] + paddsw m1, [tmp2q+64*1] + add tmp1q, 64*2 + add tmp2q, 64*2 + pmaxsw m0, m2 + pmaxsw m1, m2 + psubsw m0, m2 + psubsw m1, m2 + vpsrlvw m0, m3 + vpsrlvw m1, m3 + ret + +cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h, stride3 +%define base r6-w_avg_avx512icl_table + lea r6, [w_avg_avx512icl_table] + tzcnt wd, wm + mov t0d, r7m ; pixel_max + shr t0d, 11 + movsxd wq, [r6+wq*4] + vpbroadcastd m5, [base+w_avg_round+t0*4] + vpbroadcastd m7, [base+bidir_shift+t0*4] + add wq, r6 + mov r6d, r6m ; weight + lea t0d, [r6-16] + shl r6d, 16 + sub r6d, t0d ; 16-weight, weight + movifnidn hd, hm + vpbroadcastd m6, r6d + BIDIR_FN +ALIGN function_align +.main: + mova m3, [tmp1q+64*0] + mova m1, [tmp2q+64*0] + mova m0, [tmp1q+64*1] + mova m4, [tmp2q+64*1] + add tmp1q, 64*2 + add tmp2q, 64*2 + punpcklwd m2, m1, m3 + punpckhwd m1, m3 + punpcklwd m3, m4, m0 + punpckhwd m4, m0 + mova m0, m5 + vpdpwssd m0, m6, m2 + mova m2, m5 + vpdpwssd m2, m6, m1 + mova m1, m5 + vpdpwssd m1, m6, m3 + mova m3, m5 + vpdpwssd m3, m6, m4 + REPX {psrad x, 2}, m0, m2, m1, m3 + packusdw m0, m2 + packusdw m1, m3 + vpsrlvw m0, m7 + vpsrlvw m1, m7 + ret + +cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-mask_avx512icl_table + lea r7, [mask_avx512icl_table] + tzcnt wd, wm + mov r6d, r7m ; pixel_max + movifnidn hd, hm + shr r6d, 11 + movsxd wq, [r7+wq*4] + vpbroadcastd m8, [base+pw_64] + vpbroadcastd m9, [base+mask_round+r6*4] + vpbroadcastd m10, [base+bidir_shift+r6*4] + mov maskq, maskmp + add wq, r7 + BIDIR_FN +ALIGN function_align +.main: + pmovzxbw m1, [maskq+32*0] + mova m4, [tmp1q+64*0] + mova m2, [tmp2q+64*0] + pmovzxbw m6, [maskq+32*1] + mova m5, [tmp1q+64*1] + mova m3, [tmp2q+64*1] + add maskq, 32*2 + add tmp1q, 64*2 + add tmp2q, 64*2 + punpcklwd m7, m4, m2 + punpckhwd m4, m2 + psubw m0, m8, m1 + punpcklwd m2, m1, m0 ; m, 64-m + punpckhwd m1, m0 + mova m0, m9 + vpdpwssd m0, m7, m2 + mova m2, m9 + vpdpwssd m2, m4, m1 ; tmp1 * m + tmp2 * (64-m) + punpcklwd m7, m5, m3 + punpckhwd m5, m3 + psubw m1, m8, m6 + punpcklwd m3, m6, m1 + punpckhwd m6, m1 + mova m1, m9 + vpdpwssd m1, m7, m3 + mova m3, m9 + vpdpwssd m3, m5, m6 + REPX {psrad x, 4}, m0, m2, m1, m3 + packusdw m0, m2 + packusdw m1, m3 + vpsrlvw m0, m10 + vpsrlvw m1, m10 + ret + +cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base 
r7-w_mask_420_avx512icl_table + lea r7, [w_mask_420_avx512icl_table] + tzcnt wd, wm + mov r6d, r8m ; pixel_max + movifnidn hd, hm + shr r6d, 11 + movsxd wq, [r7+wq*4] + vpbroadcastd m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 + vpbroadcastd m11, [base+pw_64] + vpbroadcastd m12, [base+mask_round+r6*4] + vpbroadcastd m13, [base+bidir_shift+r6*4] + mov r6d, r7m ; sign + vpbroadcastd m14, [base+w_mask_round+r6*4] + mova ym15, [w_mask_end42x] + mov maskq, maskmp + add wq, r7 + call .main + lea stride3q, [strideq*3] + jmp wq +.w4: + mova m4, [w_mask_shuf4] + vpermt2b m2, m4, m3 + mova m3, m14 + vpdpbusd m3, m2, [pb_64] {1to16} + vpermb m3, m15, m3 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti32x4 xm2, ym0, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + mova [maskq], xm3 + cmp hd, 8 + jl .w4_end + vextracti32x4 xm2, m0, 2 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm0, m0, 3 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + je .w4_end + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + vextracti32x4 xm2, ym1, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + vextracti32x4 xm2, m1, 2 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm1, m1, 3 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 +.w4_end: + RET +.w8: + mova m8, [w_mask_shuf8] + vpbroadcastd m9, [pb_64] + jmp .w8_start +.w8_loop: + call .main + lea dstq, [dstq+strideq*4] + add maskq, 16 +.w8_start: + vpermt2b m2, m8, m3 + mova m3, m14 + vpdpbusd m3, m2, m9 + vpermb m3, m15, m3 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+stride3q ], m0, 3 + mova [maskq], xm3 + sub hd, 8 + jl .w8_end + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm1 + vextracti32x4 [dstq+strideq*1], ym1, 1 + vextracti32x4 [dstq+strideq*2], m1, 2 + vextracti32x4 [dstq+stride3q ], m1, 3 + jg .w8_loop +.w8_end: + RET +.w16: + mova m8, [w_mask_shuf16] + vpbroadcastd m9, [pb_64] + jmp .w16_start +.w16_loop: + call .main + lea dstq, [dstq+strideq*4] + add maskq, 16 +.w16_start: + vpermt2b m2, m8, m3 + mova m3, m14 + vpdpbusd m3, m2, m9 + vpermb m3, m15, m3 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+stride3q ], m1, 1 + mova [maskq], xm3 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + call .main + lea dstq, [dstq+strideq*4] + add maskq, 32 +.w32: + paddw m2, m3 + mova m8, m14 + vpdpwssd m8, m11, m2 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + call .main + paddw m2, m3 + mova m3, m14 + vpdpwssd m3, m11, m2 + vpermt2b m8, m15, m3 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m1 + mova [maskq], ym8 + sub hd, 4 + jg .w32_loop + RET +.w64_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 32 +.w64: + mova m8, m2 + mova m9, m3 + mova [dstq+strideq*0+64*0], m0 + mova [dstq+strideq*0+64*1], m1 + call .main + paddw m8, m2 + paddw m9, m3 + mova m2, m14 + vpdpwssd m2, m11, m8 + mova m3, m14 + vpdpwssd m3, m11, m9 + vpermt2b m2, m15, m3 + mova [dstq+strideq*1+64*0], m0 + mova [dstq+strideq*1+64*1], m1 + mova [maskq], ym2 + sub hd, 2 + jg .w64_loop + RET +.w128_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 64 +.w128: + mova m16, m2 + mova m8, m3 + mova [dstq+strideq*0+64*0], m0 + mova [dstq+strideq*0+64*1], m1 + call .main + mova m17, m2 
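
; [editor note] A scalar sketch of the 4:2:0 mask path implemented above:
; a per-pixel mask is derived from |tmp1 - tmp2| (the pw_27615 constant
; with the psubusw/psrlw 10 pair amounts to the clamped
; 38 + ((|d| + 32) >> 10) below), then four masks per 2x2 block are summed
; and rounded into one byte, with `sign` biasing the rounding via
; w_mask_round. The constants are read off the asm and should be treated
; as illustrative.

#include <stdint.h>
#include <stdlib.h>

static inline int w_mask_pixel(int t1, int t2)
{
    const int m = 38 + ((abs(t1 - t2) + 32) >> 10);
    return m > 64 ? 64 : m; /* the asm gets this clamp from psubusw saturation */
}

/* one 4:2:0 mask byte per 2x2 block; t1/t2 are w-wide prep intermediates */
static void w_mask_420_sketch(uint8_t *mask, const int16_t *t1,
                              const int16_t *t2, int w, int h, int sign)
{
    for (int y = 0; y < h; y += 2) {
        for (int x = 0; x < w; x += 2) {
            const int s = w_mask_pixel(t1[x],         t2[x])
                        + w_mask_pixel(t1[x + 1],     t2[x + 1])
                        + w_mask_pixel(t1[x + w],     t2[x + w])
                        + w_mask_pixel(t1[x + w + 1], t2[x + w + 1]);
            mask[x >> 1] = (uint8_t)((s + 2 - sign) >> 2);
        }
        mask += w >> 1;
        t1 += 2 * w;
        t2 += 2 * w;
    }
}
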
+ mova m9, m3 + mova [dstq+strideq*0+64*2], m0 + mova [dstq+strideq*0+64*3], m1 + call .main + paddw m2, m16 + paddw m3, m8 + mova m16, m14 + vpdpwssd m16, m11, m2 + mova m8, m14 + vpdpwssd m8, m11, m3 + mova [dstq+strideq*1+64*0], m0 + mova [dstq+strideq*1+64*1], m1 + call .main + paddw m2, m17 + paddw m3, m9 + mova m17, m14 + vpdpwssd m17, m11, m2 + mova m9, m14 + vpdpwssd m9, m11, m3 + vpermt2b m16, m15, m8 + vpermt2b m17, m15, m9 + mova [dstq+strideq*1+64*2], m0 + mova [dstq+strideq*1+64*3], m1 + mova [maskq+32*0], ym16 + mova [maskq+32*1], ym17 + sub hd, 2 + jg .w128_loop + vzeroupper + RET +ALIGN function_align +.main: + mova m1, [tmp1q+64*0] + mova m3, [tmp2q+64*0] + mova m4, [tmp1q+64*1] + mova m7, [tmp2q+64*1] + add tmp1q, 64*2 + add tmp2q, 64*2 + psubsw m6, m1, m3 + punpcklwd m5, m3, m1 + pabsw m6, m6 + punpckhwd m3, m1 + psubusw m6, m10, m6 + psrlw m6, 10 ; 64-m + psubw m2, m11, m6 ; m + punpcklwd m1, m6, m2 + punpckhwd m6, m2 + mova m0, m12 + vpdpwssd m0, m5, m1 + mova m1, m12 + vpdpwssd m1, m3, m6 + psubsw m5, m4, m7 + punpcklwd m6, m7, m4 + pabsw m5, m5 + punpckhwd m7, m4 + psubusw m5, m10, m5 + psrlw m5, 10 + psubw m3, m11, m5 + punpcklwd m4, m5, m3 + psrad m0, 4 + punpckhwd m5, m3 + psrad m1, 4 + packusdw m0, m1 + mova m1, m12 + vpdpwssd m1, m6, m4 + mova m4, m12 + vpdpwssd m4, m7, m5 + psrad m1, 4 + psrad m4, 4 + packusdw m1, m4 + vpsrlvw m0, m13 + vpsrlvw m1, m13 + ret + +cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_422_avx512icl_table + lea r7, [w_mask_422_avx512icl_table] + tzcnt wd, wm + mov r6d, r8m ; pixel_max + movifnidn hd, hm + shr r6d, 11 + movsxd wq, [r7+wq*4] + vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 + vpbroadcastd m9, [base+pw_64] + vpbroadcastd m10, [base+mask_round+r6*4] + vpbroadcastd m11, [base+bidir_shift+r6*4] + mov r6d, r7m ; sign + vpbroadcastd m12, [base+w_mask_round+r6*4] + mova ym13, [w_mask_end42x] + mov maskq, maskmp + add wq, r7 + paddw m14, m9, m9 ; pw_128 + call .main + lea stride3q, [strideq*3] + jmp wq +.w4: + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti32x4 xm2, ym0, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + cmp hd, 8 + jl .w4_end + vextracti32x4 xm2, m0, 2 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm0, m0, 3 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + je .w4_end + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + vextracti32x4 xm2, ym1, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + vextracti32x4 xm2, m1, 2 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm1, m1, 3 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 +.w4_end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*4] +.w8: + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+stride3q ], m0, 3 + sub hd, 8 + jl .w8_end + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm1 + vextracti32x4 [dstq+strideq*1], ym1, 1 + vextracti32x4 [dstq+strideq*2], m1, 2 + vextracti32x4 [dstq+stride3q ], m1, 3 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*4] +.w16: + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+stride3q ], m1, 1 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + call 
.main + lea dstq, [dstq+strideq*2] +.w32: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + call .main + mova [dstq+64*2], m0 + mova [dstq+64*3], m1 + dec hd + jg .w128_loop + RET +ALIGN function_align +.main: + mova m1, [tmp1q+64*0] + mova m3, [tmp2q+64*0] + mova m4, [tmp1q+64*1] + mova m7, [tmp2q+64*1] + add tmp1q, 64*2 + add tmp2q, 64*2 + psubsw m6, m1, m3 + punpcklwd m5, m3, m1 + pabsw m6, m6 + punpckhwd m3, m1 + psubusw m6, m8, m6 + psrlw m6, 10 + psubw m2, m9, m6 + punpcklwd m1, m6, m2 + punpckhwd m6, m2 + mova m0, m10 + vpdpwssd m0, m5, m1 + mova m1, m10 + vpdpwssd m1, m3, m6 + psubsw m5, m4, m7 + punpcklwd m6, m7, m4 + pabsw m5, m5 + punpckhwd m7, m4 + psubusw m5, m8, m5 + psrlw m5, 10 + psubw m3, m9, m5 + punpcklwd m4, m5, m3 + psrad m0, 4 + punpckhwd m5, m3 + psrad m1, 4 + packusdw m0, m1 + mova m1, m10 + vpdpwssd m1, m6, m4 + mova m4, m10 + vpdpwssd m4, m7, m5 + mova m5, m12 + vpdpwssd m5, m14, m2 + mova m2, m12 + vpdpwssd m2, m14, m3 + psrad m1, 4 + psrad m4, 4 + packusdw m1, m4 + vpermt2b m5, m13, m2 + vpsrlvw m0, m11 + vpsrlvw m1, m11 + mova [maskq], ym5 + add maskq, 32 + ret + +cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_444_avx512icl_table + lea r7, [w_mask_444_avx512icl_table] + tzcnt wd, wm + mov r6d, r8m ; pixel_max + movifnidn hd, hm + shr r6d, 11 + movsxd wq, [r7+wq*4] + vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 + vpbroadcastd m9, [base+pw_64] + vpbroadcastd m10, [base+mask_round+r6*4] + mova m11, [w_mask_end444] + vpbroadcastd m12, [base+bidir_shift+r6*4] + mov maskq, maskmp + add wq, r7 + call .main + lea stride3q, [strideq*3] + jmp wq +.w4: + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + vextracti32x4 xm2, ym0, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + cmp hd, 8 + jl .w4_end + vextracti32x4 xm2, m0, 2 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm0, m0, 3 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + je .w4_end + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + vextracti32x4 xm2, ym1, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + vextracti32x4 xm2, m1, 2 + lea dstq, [dstq+strideq*4] + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm1, m1, 3 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 +.w4_end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*4] +.w8: + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+stride3q ], m0, 3 + sub hd, 8 + jl .w8_end + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], xm1 + vextracti32x4 [dstq+strideq*1], ym1, 1 + vextracti32x4 [dstq+strideq*2], m1, 2 + vextracti32x4 [dstq+stride3q ], m1, 3 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*4] +.w16: + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+stride3q ], m1, 1 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + call .main + lea dstq, [dstq+strideq*2] +.w32: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jg .w32_loop + RET 
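
; [editor note] All three w_mask variants share the same .main blend,
; spelled out in an asm comment above as tmp1 * m + tmp2 * (64-m): the
; interleaved vpdpwssd dot products compute it pairwise, psrad 4 plus a
; per-bitdepth vpsrlvw (bidir_shift) rescale it, and packusdw saturates.
; A scalar sketch, written for a per-pixel mask as in the 444 case; `rnd`
; stands in for mask_round, which also cancels the bias carried by the
; prep-domain intermediates, so treat the values as illustrative.

#include <stddef.h>
#include <stdint.h>

static inline uint16_t clip_pixel(int v, int pixel_max)
{
    return (uint16_t)(v < 0 ? 0 : v > pixel_max ? pixel_max : v);
}

static void w_mask_blend_sketch(uint16_t *dst, ptrdiff_t stride,
                                const int16_t *t1, const int16_t *t2,
                                const uint8_t *mask, int w, int h,
                                int rnd, int final_sh, int pixel_max)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            const int m = mask[x]; /* 0..64 */
            const int v = (t1[x] * m + t2[x] * (64 - m) + rnd) >> 4;
            dst[x] = clip_pixel(v >> final_sh, pixel_max);
        }
        dst += stride;
        t1 += w;
        t2 += w;
        mask += w;
    }
}
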
+.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + call .main + mova [dstq+64*2], m0 + mova [dstq+64*3], m1 + dec hd + jg .w128_loop + RET +ALIGN function_align +.main: + mova m1, [tmp1q+64*0] + mova m3, [tmp2q+64*0] + mova m4, [tmp1q+64*1] + mova m7, [tmp2q+64*1] + add tmp1q, 64*2 + add tmp2q, 64*2 + psubsw m6, m1, m3 + punpcklwd m5, m3, m1 + pabsw m6, m6 + punpckhwd m3, m1 + psubusw m6, m8, m6 + psrlw m6, 10 + psubw m2, m9, m6 + punpcklwd m1, m6, m2 + punpckhwd m6, m2 + mova m0, m10 + vpdpwssd m0, m5, m1 + mova m1, m10 + vpdpwssd m1, m3, m6 + psubsw m5, m4, m7 + punpcklwd m6, m7, m4 + pabsw m5, m5 + punpckhwd m7, m4 + psubusw m5, m8, m5 + psrlw m5, 10 + psubw m3, m9, m5 + punpcklwd m4, m5, m3 + psrad m0, 4 + punpckhwd m5, m3 + psrad m1, 4 + packusdw m0, m1 + mova m1, m10 + vpdpwssd m1, m6, m4 + mova m4, m10 + vpdpwssd m4, m7, m5 + vpermt2b m2, m11, m3 + psrad m1, 4 + psrad m4, 4 + packusdw m1, m4 + vpsrlvw m0, m12 + vpsrlvw m1, m12 + mova [maskq], m2 + add maskq, 64 + ret + +cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask +%define base r6-blend_avx512icl_table + lea r6, [blend_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r6+wq*4] + movifnidn maskq, maskmp + vpbroadcastd m6, [base+pw_m512] + add wq, r6 + lea r6, [dsq*3] + jmp wq +.w4: + pmovzxbw ym19, [maskq] + movq xm16, [dstq+dsq*0] + movhps xm16, [dstq+dsq*1] + vpbroadcastq ym17, [dstq+dsq*2] + vpbroadcastq ym18, [dstq+r6 ] + pmullw ym19, ym6 + vpblendd ym16, ym17, 0x30 + vpblendd ym16, ym18, 0xc0 + psubw ym17, ym16, [tmpq] + add maskq, 16 + add tmpq, 32 + pmulhrsw ym17, ym19 + paddw ym16, ym17 + vextracti128 xm17, ym16, 1 + movq [dstq+dsq*0], xm16 + movhps [dstq+dsq*1], xm16 + movq [dstq+dsq*2], xm17 + movhps [dstq+r6 ], xm17 + lea dstq, [dstq+dsq*4] + sub hd, 4 + jg .w4 + vzeroupper + RET +.w8: + pmovzxbw m2, [maskq] + mova xm0, [dstq+dsq*0] + vinserti32x4 ym0, [dstq+dsq*1], 1 + vinserti32x4 m0, [dstq+dsq*2], 2 + vinserti32x4 m0, [dstq+r6 ], 3 + pmullw m2, m6 + psubw m1, m0, [tmpq] + add maskq, 32 + add tmpq, 64 + pmulhrsw m1, m2 + paddw m0, m1 + mova [dstq+dsq*0], xm0 + vextracti32x4 [dstq+dsq*1], ym0, 1 + vextracti32x4 [dstq+dsq*2], m0, 2 + vextracti32x4 [dstq+r6 ], m0, 3 + lea dstq, [dstq+dsq*4] + sub hd, 4 + jg .w8 + RET +.w16: + pmovzxbw m4, [maskq+32*0] + pmovzxbw m5, [maskq+32*1] + mova ym0, [dstq+dsq*0] + vinserti32x8 m0, [dstq+dsq*1], 1 + mova ym1, [dstq+dsq*2] + vinserti32x8 m1, [dstq+r6 ], 1 + pmullw m4, m6 + pmullw m5, m6 + psubw m2, m0, [tmpq+64*0] + psubw m3, m1, [tmpq+64*1] + add maskq, 32*2 + add tmpq, 64*2 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + mova [dstq+dsq*2], ym1 + vextracti32x8 [dstq+r6 ], m1, 1 + lea dstq, [dstq+dsq*4] + sub hd, 4 + jg .w16 + RET +.w32: + pmovzxbw m4, [maskq+32*0] + pmovzxbw m5, [maskq+32*1] + mova m0, [dstq+dsq*0] + mova m1, [dstq+dsq*1] + pmullw m4, m6 + pmullw m5, m6 + psubw m2, m0, [tmpq+ 64*0] + psubw m3, m1, [tmpq+ 64*1] + add maskq, 32*2 + add tmpq, 64*2 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w32 + RET + +cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h + lea r5, [blend_v_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r5+wq*4] + add wq, r5 + jmp wq +.w2: + 
vpbroadcastd xmm2, [obmc_masks_avx2+2*2] +.w2_loop: + movd xmm0, [dstq+dsq*0] + pinsrd xmm0, [dstq+dsq*1], 1 + movq xmm1, [tmpq] + add tmpq, 4*2 + psubw xmm1, xmm0, xmm1 + pmulhrsw xmm1, xmm2 + paddw xmm0, xmm1 + movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w2_loop + RET +.w4: + vpbroadcastq xmm2, [obmc_masks_avx2+4*2] +.w4_loop: + movq xmm0, [dstq+dsq*0] + movhps xmm0, [dstq+dsq*1] + psubw xmm1, xmm0, [tmpq] + add tmpq, 8*2 + pmulhrsw xmm1, xmm2 + paddw xmm0, xmm1 + movq [dstq+dsq*0], xmm0 + movhps [dstq+dsq*1], xmm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w4_loop + RET +.w8: + vbroadcasti32x4 ym2, [obmc_masks_avx2+8*2] +.w8_loop: + mova xm0, [dstq+dsq*0] + vinserti32x4 ym0, [dstq+dsq*1], 1 + psubw ym1, ym0, [tmpq] + add tmpq, 16*2 + pmulhrsw ym1, ym2 + paddw ym0, ym1 + mova [dstq+dsq*0], xm0 + vextracti32x4 [dstq+dsq*1], ym0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w8_loop + RET +.w16: + vbroadcasti32x8 m2, [obmc_masks_avx2+16*2] +.w16_loop: + mova ym0, [dstq+dsq*0] + vinserti32x8 m0, [dstq+dsq*1], 1 + psubw m1, m0, [tmpq] + add tmpq, 32*2 + pmulhrsw m1, m2 + paddw m0, m1 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w16_loop + RET +.w32: + mova m4, [obmc_masks_avx2+32*2] +.w32_loop: + mova m0, [dstq+dsq*0] + psubw m2, m0, [tmpq+ 64*0] + mova m1, [dstq+dsq*1] + psubw m3, m1, [tmpq+ 64*1] + add tmpq, 64*2 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w32_loop + RET + +cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, mask +%define base r6-$$ + lea r6, [$$] + tzcnt wd, wm + mov hd, hm + movsxd wq, [base+blend_h_avx512icl_table+wq*4] + lea maskq, [base+obmc_masks_avx2+hq*2] + lea hd, [hq*3] + lea wq, [base+blend_h_avx512icl_table+wq] + shr hd, 2 ; h * 3/4 + lea maskq, [maskq+hq*2] + neg hq + jmp wq +.w2: + movd xmm0, [dstq+dsq*0] + pinsrd xmm0, [dstq+dsq*1], 1 + movd xmm2, [maskq+hq*2] + movq xmm1, [tmpq] + add tmpq, 4*2 + punpcklwd xmm2, xmm2 + psubw xmm1, xmm0, xmm1 + pmulhrsw xmm1, xmm2 + paddw xmm0, xmm1 + movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w2 + RET +.w4: + mova xmm3, [blend_shuf] +.w4_loop: + movq xmm0, [dstq+dsq*0] + movhps xmm0, [dstq+dsq*1] + movd xmm2, [maskq+hq*2] + psubw xmm1, xmm0, [tmpq] + add tmpq, 8*2 + pshufb xmm2, xmm3 + pmulhrsw xmm1, xmm2 + paddw xmm0, xmm1 + movq [dstq+dsq*0], xmm0 + movhps [dstq+dsq*1], xmm0 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w4_loop + RET +.w8: + vbroadcasti32x4 ym3, [blend_shuf] + shufpd ym3, ym3, 0x0c +.w8_loop: + mova xm0, [dstq+dsq*0] + vinserti32x4 ym0, [dstq+dsq*1], 1 + vpbroadcastd ym2, [maskq+hq*2] + psubw ym1, ym0, [tmpq] + add tmpq, 16*2 + pshufb ym2, ym3 + pmulhrsw ym1, ym2 + paddw ym0, ym1 + mova [dstq+dsq*0], xm0 + vextracti32x4 [dstq+dsq*1], ym0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w8_loop + RET +.w16: + vbroadcasti32x4 m3, [blend_shuf] + shufpd m3, m3, 0xf0 +.w16_loop: + mova ym0, [dstq+dsq*0] + vinserti32x8 m0, [dstq+dsq*1], 1 + vpbroadcastd m2, [maskq+hq*2] + psubw m1, m0, [tmpq] + add tmpq, 32*2 + pshufb m2, m3 + pmulhrsw m1, m2 + paddw m0, m1 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w16_loop + RET +.w32: + vpbroadcastw m4, [maskq+hq*2] + vpbroadcastw m5, [maskq+hq*2+2] + mova m0, [dstq+dsq*0] + psubw m2, m0, [tmpq+ 64*0] + mova m1, [dstq+dsq*1] + psubw m3, m1, 
[tmpq+ 64*1] + add tmpq, 64*2 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w32 + RET +.w64: + vpbroadcastw m4, [maskq+hq*2] + mova m0, [dstq+64*0] + psubw m2, m0, [tmpq+64*0] + mova m1, [dstq+64*1] + psubw m3, m1, [tmpq+64*1] + add tmpq, 64*2 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + paddw m0, m2 + paddw m1, m3 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + add dstq, dsq + inc hq + jl .w64 + RET +.w128: + vpbroadcastw m8, [maskq+hq*2] + mova m0, [dstq+64*0] + psubw m4, m0, [tmpq+64*0] + mova m1, [dstq+64*1] + psubw m5, m1, [tmpq+64*1] + mova m2, [dstq+64*2] + psubw m6, m2, [tmpq+64*2] + mova m3, [dstq+64*3] + psubw m7, m3, [tmpq+64*3] + add tmpq, 64*4 + REPX {pmulhrsw x, m8}, m4, m5, m6, m7 + paddw m0, m4 + paddw m1, m5 + paddw m2, m6 + paddw m3, m7 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + mova [dstq+64*2], m2 + mova [dstq+64*3], m3 + add dstq, dsq + inc hq + jl .w128 + RET + +cglobal resize_16bpc, 6, 12, 32, dst, dst_stride, src, src_stride, \ + dst_w, h, src_w, dx, mx0, pxmax + sub dword mx0m, 4<<14 + sub dword src_wm, 8 + mov r6, ~0 + vpbroadcastd m5, dxm + vpbroadcastd m8, mx0m + vpbroadcastd m6, src_wm + kmovq k6, r6 + DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax + LEA r7, $$ +%define base r7-$$ + vpbroadcastd m3, [base+pd_16384] + vpbroadcastd m7, [base+pd_63] + mova m24, [base+resize_permA] + mova m25, [base+resize_permB] + mova m26, [base+resize_permC] + mova m27, [base+resize_permD] + vbroadcasti32x4 m28, [base+resize_shufA] + vbroadcasti32x4 m29, [base+resize_shufB] + mova m30, [base+resize_permE] + vpbroadcastw ym31, pxmaxm + vpdpwssd m8, m5, [base+rescale_mul] ; mx+dx*[0-15] + pslld m5, 4 ; dx*16 + pslld m6, 14 + pxor m2, m2 +.loop_y: + xor xd, xd + mova m4, m8 ; per-line working version of mx +.loop_x: + pmaxsd m0, m4, m2 + psrad m9, m4, 8 ; filter offset (unmasked) + pminsd m0, m6 ; iclip(mx, 0, src_w-8) + psubd m1, m4, m0 ; pshufb offset + psrad m0, 14 ; clipped src_x offset + psrad m1, 14 ; pshufb edge_emu offset + vptestmd k5, m1, m1 + pand m9, m7 ; filter offset (masked) + ktestw k5, k5 + jz .load + vpbroadcastq m14, [base+pd_0_4] + vpermq m10, m0, q1100 + vpermq m11, m0, q3322 + vpermq m20, m1, q1100 + vpermq m21, m1, q3322 + punpckldq m10, m10 + punpckldq m11, m11 + punpckldq m20, m20 + punpckldq m21, m21 + paddd m10, m14 + paddd m11, m14 + paddd m20, m14 + paddd m21, m14 + vextracti32x8 ym12, m10, 1 + vextracti32x8 ym13, m11, 1 + vextracti32x8 ym22, m20, 1 + vextracti32x8 ym23, m21, 1 + kmovq k1, k6 + kmovq k2, k6 + kmovq k3, k6 + kmovq k4, k6 + vpgatherdq m16{k1}, [srcq+ym10*2] ; 0 1 2 3 + vpgatherdq m17{k2}, [srcq+ym11*2] ; 4 5 6 7 + vpgatherdq m18{k3}, [srcq+ym12*2] ; 8 9 A B + vpgatherdq m19{k4}, [srcq+ym13*2] ; C D E F + kmovq k1, k6 + kmovq k2, k6 + kmovq k3, k6 + kmovq k4, k6 + vpgatherdq m0{k1}, [base+resize_shuf+8+ym20*2] + vpgatherdq m1{k2}, [base+resize_shuf+8+ym21*2] + vpgatherdq m14{k3}, [base+resize_shuf+8+ym22*2] + vpgatherdq m15{k4}, [base+resize_shuf+8+ym23*2] + pshufb m16, m0 + pshufb m17, m1 + pshufb m18, m14 + pshufb m19, m15 + mova m20, m24 + mova m22, m24 + mova m21, m25 + mova m23, m25 + vpermi2d m20, m16, m17 ; 0-3a 0-3b 4-7a 4-7b + vpermi2d m21, m16, m17 ; 0-3c 0-3d 4-7c 4-7d + vpermi2d m22, m18, m19 ; 8-Ba 8-Bb C-Fa C-Fb + vpermi2d m23, m18, m19 ; 8-Bc 8-Bd C-Fc C-Fd + mova m15, m26 + mova m17, m26 + mova m16, m27 + mova m18, m27 + vpermi2q m15, m20, m22 ; 0-3a 4-7a 8-Ba C-Fa + vpermi2q m16, m20, m22 ; 
0-3b 4-7b 8-Bb C-Fb + vpermi2q m17, m21, m23 ; 0-3c 4-7c 8-Bc C-Fc + vpermi2q m18, m21, m23 ; 0-3d 4-7d 8-Bd C-Fd + kmovq k1, k6 + kmovq k2, k6 + vpgatherdd m11{k1}, [base+resize_filter+m9*8+0] + vpgatherdd m13{k2}, [base+resize_filter+m9*8+4] + pshufb m10, m11, m28 + pshufb m11, m11, m29 + pshufb m12, m13, m28 + pshufb m13, m13, m29 + jmp .filter +.load: + kmovq k1, k6 + kmovq k2, k6 + kmovq k3, k6 + kmovq k4, k6 + vpgatherdd m11{k1}, [base+resize_filter+m9*8+0] + vpgatherdd m13{k2}, [base+resize_filter+m9*8+4] + pshufb m10, m11, m28 + pshufb m11, m11, m29 + pshufb m12, m13, m28 + pshufb m13, m13, m29 + vpgatherdd m15{k3}, [srcq+m0*2+ 0] + vpgatherdd m16{k4}, [srcq+m0*2+ 4] + kmovq k1, k6 + kmovq k2, k6 + vpgatherdd m17{k1}, [srcq+m0*2+ 8] + vpgatherdd m18{k2}, [srcq+m0*2+12] +.filter: + mova m14, m2 + vpdpwssd m14, m15, m10 + vpdpwssd m14, m16, m11 + vpdpwssd m14, m17, m12 + vpdpwssd m14, m18, m13 + psubd m14, m3, m14 + psrad m14, 15 + packusdw m14, m14 + vpermq m14, m30, m14 + pminsw ym14, ym31 + mova [dstq+xq*2], ym14 + paddd m4, m5 + add xd, 16 + cmp xd, dst_wd + jl .loop_x + add dstq, dst_strideq + add srcq, src_strideq + dec hd + jg .loop_y + RET + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/mc16_sse.asm b/third_party/dav1d/src/x86/mc16_sse.asm new file mode 100644 index 0000000000..fde8e372a3 --- /dev/null +++ b/third_party/dav1d/src/x86/mc16_sse.asm @@ -0,0 +1,8731 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
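
; [editor note] A scalar sketch of the resize_16bpc inner loop that closes
; the AVX-512 file: mx is a 14-bit fixed-point source position stepped by
; dx per output pixel, its top bits give the source x, and bits 8..13
; select one of 64 8-tap phases from the shared resize_filter table. That
; table stores negated coefficients, which is why the asm rounds with
; psubd from pd_16384 / psrad 15; with the <<8 coefficient scaling removed
; that is the (64 - sum) >> 7 below. The edge handling done by the
; resize_shuf gathers is reduced to a clamp; treat the exact rounding as
; illustrative.

#include <stdint.h>

static void resize_row_sketch(uint16_t *dst, const uint16_t *src,
                              int dst_w, int src_w, int dx, int mx0,
                              const int8_t (*filters)[8], int pixel_max)
{
    for (int x = 0, mx = mx0; x < dst_w; x++, mx += dx) {
        const int8_t *f = filters[(mx >> 8) & 63]; /* filter phase */
        const int sx = mx >> 14;                   /* integer source x */
        int sum = 0;
        for (int k = 0; k < 8; k++) {
            int px = sx + k - 3;                   /* taps span [-3, +4] */
            px = px < 0 ? 0 : px > src_w - 1 ? src_w - 1 : px;
            sum += f[k] * src[px];                 /* f[] is stored negated */
        }
        const int v = (64 - sum) >> 7;
        dst[x] = (uint16_t)(v < 0 ? 0 : v > pixel_max ? pixel_max : v);
    }
}
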
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA + +; dav1d_obmc_masks[] << 9 +obmc_masks: dw 0, 0, 9728, 0, 12800, 7168, 2560, 0 + dw 14336, 11264, 8192, 5632, 3584, 1536, 0, 0 + dw 15360, 13824, 12288, 10752, 9216, 7680, 6144, 5120 + dw 4096, 3072, 2048, 1536, 0, 0, 0, 0 + dw 15872, 14848, 14336, 13312, 12288, 11776, 10752, 10240 + dw 9728, 8704, 8192, 7168, 6656, 6144, 5632, 4608 + dw 4096, 3584, 3072, 2560, 2048, 2048, 1536, 1024 + +blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 +spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 +spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 +spel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 +spel_s_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 +spel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +unpckw: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 +rescale_mul: dd 0, 1, 2, 3 +resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 + db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15 +bdct_lb_q: times 8 db 0 + times 8 db 4 + times 8 db 8 + times 8 db 12 + +pw_2: times 8 dw 2 +pw_16: times 4 dw 16 +prep_mul: times 4 dw 16 + times 8 dw 4 +pw_64: times 8 dw 64 +pw_256: times 8 dw 256 +pw_2048: times 4 dw 2048 +bidir_mul: times 4 dw 2048 +pw_8192: times 8 dw 8192 +pw_27615: times 8 dw 27615 +pw_32766: times 8 dw 32766 +pw_m512: times 8 dw -512 +pd_63: times 4 dd 63 +pd_64: times 4 dd 64 +pd_512: times 4 dd 512 +pd_m524256: times 4 dd -524256 ; -8192 << 6 + 32 +pd_0x3ff: times 4 dd 0x3ff +pd_0x4000: times 4 dd 0x4000 +pq_0x400000: times 2 dq 0x400000 +pq_0x40000000: times 2 dq 0x40000000 +pd_65538: times 2 dd 65538 + +put_bilin_h_rnd: times 4 dw 8 + times 4 dw 10 +s_8tap_h_rnd: times 2 dd 2 + times 2 dd 8 +put_s_8tap_v_rnd: times 2 dd 512 + times 2 dd 128 +s_8tap_h_sh: dd 2, 4 +put_s_8tap_v_sh: dd 10, 8 +bidir_rnd: times 4 dw -16400 + times 4 dw -16388 +put_8tap_h_rnd: dd 34, 34, 40, 40 +prep_8tap_1d_rnd: times 2 dd 8 - (8192 << 4) +prep_8tap_2d_rnd: times 4 dd 32 - (8192 << 5) + +warp8x8_shift: dd 11, 13 +warp8x8_rnd1: dd 1024, 1024, 4096, 4096 +warp8x8_rnd2: times 4 dw 4096 + times 4 dw 16384 +warp8x8t_rnd: times 2 dd 16384 - (8192 << 15) + +%macro BIDIR_JMP_TABLE 2-* + %xdefine %1_%2_table (%%table - 2*%3) + %xdefine %%base %1_%2_table + %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2) + %%table: + %rep %0 - 2 + dd %%prefix %+ .w%3 - %%base + %rotate 1 + %endrep +%endmacro + +BIDIR_JMP_TABLE avg, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_avg, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE mask, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE blend, ssse3, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_v, ssse3, 2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_h, ssse3, 2, 4, 8, 16, 32, 64, 128 + +%macro BASE_JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - %3) + %xdefine %%base %1_%2 + %%table: + %rep %0 - 2 + dw %%base %+ _w%3 - %%base + %rotate 1 + %endrep +%endmacro + +%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_16bpc_ssse3.put) +%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_16bpc_ssse3.prep) + +BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128 +BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128 + +%macro SCALED_JMP_TABLE 2-* + %xdefine %1_%2_table (%%table - %3) + %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2) 
+%%table: + %rep %0 - 2 + dw %%base %+ .w%3 - %%base + %rotate 1 + %endrep + %rotate 2 +%%dy_1024: + %xdefine %1_%2_dy1_table (%%dy_1024 - %3) + %rep %0 - 2 + dw %%base %+ .dy1_w%3 - %%base + %rotate 1 + %endrep + %rotate 2 +%%dy_2048: + %xdefine %1_%2_dy2_table (%%dy_2048 - %3) + %rep %0 - 2 + dw %%base %+ .dy2_w%3 - %%base + %rotate 1 + %endrep +%endmacro + +SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128 +SCALED_JMP_TABLE prep_8tap_scaled, ssse3, 4, 8, 16, 32, 64, 128 + +cextern mc_subpel_filters +%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) + +cextern mc_warp_filter +cextern resize_filter + +SECTION .text + +%if UNIX64 +DECLARE_REG_TMP 7 +%else +DECLARE_REG_TMP 5 +%endif + +INIT_XMM ssse3 +cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w, h, mxy +%define base t0-put_ssse3 + mov mxyd, r6m ; mx + LEA t0, put_ssse3 + movifnidn wd, wm + test mxyd, mxyd + jnz .h + mov mxyd, r7m ; my + test mxyd, mxyd + jnz .v +.put: + tzcnt wd, wd + movzx wd, word [base+put_ssse3_table+wq*2] + add wq, t0 + movifnidn hd, hm + jmp wq +.put_w2: + mov r4d, [srcq+ssq*0] + mov r6d, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r4d + mov [dstq+dsq*1], r6d + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w2 + RET +.put_w4: + movq m0, [srcq+ssq*0] + movq m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movq [dstq+dsq*0], m0 + movq [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w4 + RET +.put_w8: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w8 + RET +.put_w16: + movu m0, [srcq+ssq*0+16*0] + movu m1, [srcq+ssq*0+16*1] + movu m2, [srcq+ssq*1+16*0] + movu m3, [srcq+ssq*1+16*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0+16*0], m0 + mova [dstq+dsq*0+16*1], m1 + mova [dstq+dsq*1+16*0], m2 + mova [dstq+dsq*1+16*1], m3 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w16 + RET +.put_w32: + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + add srcq, ssq + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + mova [dstq+16*3], m3 + add dstq, dsq + dec hd + jg .put_w32 + RET +.put_w64: + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + mova [dstq+16*3], m3 + movu m0, [srcq+16*4] + movu m1, [srcq+16*5] + movu m2, [srcq+16*6] + movu m3, [srcq+16*7] + add srcq, ssq + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + mova [dstq+16*6], m2 + mova [dstq+16*7], m3 + add dstq, dsq + dec hd + jg .put_w64 + RET +.put_w128: + add srcq, 16*8 + add dstq, 16*8 +.put_w128_loop: + movu m0, [srcq-16*8] + movu m1, [srcq-16*7] + movu m2, [srcq-16*6] + movu m3, [srcq-16*5] + mova [dstq-16*8], m0 + mova [dstq-16*7], m1 + mova [dstq-16*6], m2 + mova [dstq-16*5], m3 + movu m0, [srcq-16*4] + movu m1, [srcq-16*3] + movu m2, [srcq-16*2] + movu m3, [srcq-16*1] + mova [dstq-16*4], m0 + mova [dstq-16*3], m1 + mova [dstq-16*2], m2 + mova [dstq-16*1], m3 + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + mova [dstq+16*3], m3 + movu m0, [srcq+16*4] + movu m1, [srcq+16*5] + movu m2, [srcq+16*6] + movu m3, [srcq+16*7] + add srcq, ssq + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + mova [dstq+16*6], m2 + mova [dstq+16*7], m3 + add dstq, dsq + dec hd + jg .put_w128_loop + RET +.h: + movd m5, mxyd + mov 
mxyd, r7m ; my + mova m4, [base+pw_16] + pshufb m5, [base+pw_256] + psubw m4, m5 + test mxyd, mxyd + jnz .hv + ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v + mov r6d, r8m ; bitdepth_max + shr r6d, 11 + movddup m3, [base+put_bilin_h_rnd+r6*8] + movifnidn hd, hm + sub wd, 8 + jg .h_w16 + je .h_w8 + cmp wd, -4 + je .h_w4 +.h_w2: + movq m1, [srcq+ssq*0] + movhps m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmullw m0, m4, m1 + psrlq m1, 16 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 4 + movd [dstq+dsq*0], m0 + punpckhqdq m0, m0 + movd [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2 + RET +.h_w4: + movq m0, [srcq+ssq*0] + movhps m0, [srcq+ssq*1] + movq m1, [srcq+ssq*0+2] + movhps m1, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 4 + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4 + RET +.h_w8: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*0+2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + movu m1, [srcq+ssq*1] + movu m2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + pmullw m1, m4 + pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m0, 4 + psrlw m1, 4 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + neg wq +.h_w16_loop0: + mov r6, wq +.h_w16_loop: + movu m0, [srcq+r6*2+ 0] + movu m1, [srcq+r6*2+ 2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + movu m1, [srcq+r6*2+16] + movu m2, [srcq+r6*2+18] + pmullw m1, m4 + pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m0, 4 + psrlw m1, 4 + mova [dstq+r6*2+16*0], m0 + mova [dstq+r6*2+16*1], m1 + add r6, 16 + jl .h_w16_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w16_loop0 + RET +.v: + shl mxyd, 11 + movd m5, mxyd + pshufb m5, [base+pw_256] + movifnidn hd, hm + cmp wd, 4 + jg .v_w8 + je .v_w4 +.v_w2: + movd m0, [srcq+ssq*0] +.v_w2_loop: + movd m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklqdq m2, m0, m1 + movd m0, [srcq+ssq*0] + punpcklqdq m1, m0 + psubw m1, m2 + pmulhrsw m1, m5 + paddw m1, m2 + movd [dstq+dsq*0], m1 + punpckhqdq m1, m1 + movd [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movq m0, [srcq+ssq*0] +.v_w4_loop: + movq m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklqdq m2, m0, m1 + movq m0, [srcq+ssq*0] + punpcklqdq m1, m0 + psubw m1, m2 + pmulhrsw m1, m5 + paddw m1, m2 + movq [dstq+dsq*0], m1 + movhps [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: +%if ARCH_X86_64 +%if WIN64 + push r7 +%endif + shl wd, 5 + mov r7, srcq + lea r6d, [wq+hq-256] + mov r4, dstq +%else + mov r6, srcq +%endif +.v_w8_loop0: + movu m0, [srcq+ssq*0] +.v_w8_loop: + movu m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + psubw m1, m3, m0 + pmulhrsw m1, m5 + paddw m1, m0 + movu m0, [srcq+ssq*0] + psubw m2, m0, m3 + pmulhrsw m2, m5 + paddw m2, m3 + mova [dstq+dsq*0], m1 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop +%if ARCH_X86_64 + add r7, 16 + add r4, 16 + movzx hd, r6b + mov srcq, r7 + mov dstq, r4 + sub r6d, 1<<8 +%else + mov dstq, dstmp + add r6, 16 + mov hd, hm + add dstq, 16 + mov srcq, r6 + mov dstmp, dstq + sub wd, 8 +%endif + jg .v_w8_loop0 +%if WIN64 + pop r7 +%endif + RET +.hv: + WIN64_SPILL_XMM 8 + shl mxyd, 11 + mova m3, [base+pw_2] + movd m6, mxyd + mova m7, [base+pw_8192] + pshufb m6, [base+pw_256] + test dword r8m, 0x800 + 
jnz .hv_12bpc + psllw m4, 2 + psllw m5, 2 + mova m7, [base+pw_2048] +.hv_12bpc: + movifnidn hd, hm + cmp wd, 4 + jg .hv_w8 + je .hv_w4 +.hv_w2: + movddup m0, [srcq+ssq*0] + pshufhw m1, m0, q0321 + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 2 +.hv_w2_loop: + movq m2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps m2, [srcq+ssq*0] + pmullw m1, m4, m2 + psrlq m2, 16 + pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m1, 2 ; 1 _ 2 _ + shufpd m2, m0, m1, 0x01 ; 0 _ 1 _ + mova m0, m1 + psubw m1, m2 + paddw m1, m1 + pmulhw m1, m6 + paddw m1, m2 + pmulhrsw m1, m7 + movd [dstq+dsq*0], m1 + punpckhqdq m1, m1 + movd [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + movddup m0, [srcq+ssq*0] + movddup m1, [srcq+ssq*0+2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 2 +.hv_w4_loop: + movq m1, [srcq+ssq*1] + movq m2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + movhps m1, [srcq+ssq*0] + movhps m2, [srcq+ssq*0+2] + pmullw m1, m4 + pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m1, 2 ; 1 2 + shufpd m2, m0, m1, 0x01 ; 0 1 + mova m0, m1 + psubw m1, m2 + paddw m1, m1 + pmulhw m1, m6 + paddw m1, m2 + pmulhrsw m1, m7 + movq [dstq+dsq*0], m1 + movhps [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: +%if ARCH_X86_64 +%if WIN64 + push r7 +%endif + shl wd, 5 + lea r6d, [wq+hq-256] + mov r4, srcq + mov r7, dstq +%else + mov r6, srcq +%endif +.hv_w8_loop0: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*0+2] + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m3 + paddw m0, m1 + psrlw m0, 2 +.hv_w8_loop: + movu m1, [srcq+ssq*1] + movu m2, [srcq+ssq*1+2] + lea srcq, [srcq+ssq*2] + pmullw m1, m4 + pmullw m2, m5 + paddw m1, m3 + paddw m1, m2 + psrlw m1, 2 + psubw m2, m1, m0 + paddw m2, m2 + pmulhw m2, m6 + paddw m2, m0 + pmulhrsw m2, m7 + mova [dstq+dsq*0], m2 + movu m0, [srcq+ssq*0] + movu m2, [srcq+ssq*0+2] + pmullw m0, m4 + pmullw m2, m5 + paddw m0, m3 + paddw m0, m2 + psrlw m0, 2 + psubw m2, m0, m1 + paddw m2, m2 + pmulhw m2, m6 + paddw m2, m1 + pmulhrsw m2, m7 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop +%if ARCH_X86_64 + add r4, 16 + add r7, 16 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<8 +%else + mov dstq, dstmp + add r6, 16 + mov hd, hm + add dstq, 16 + mov srcq, r6 + mov dstmp, dstq + sub wd, 8 +%endif + jg .hv_w8_loop0 +%if WIN64 + pop r7 +%endif + RET + +cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w, h, mxy, stride3 +%define base r6-prep_ssse3 + movifnidn mxyd, r5m ; mx + LEA r6, prep_ssse3 + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r6m ; my + test mxyd, mxyd + jnz .v +.prep: + tzcnt wd, wd + movzx wd, word [base+prep_ssse3_table+wq*2] + mov r5d, r7m ; bitdepth_max + mova m5, [base+pw_8192] + add wq, r6 + shr r5d, 11 + movddup m4, [base+prep_mul+r5*8] + lea stride3q, [strideq*3] + jmp wq +.prep_w4: + movq m0, [srcq+strideq*0] + movhps m0, [srcq+strideq*1] + movq m1, [srcq+strideq*2] + movhps m1, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + pmullw m0, m4 + pmullw m1, m4 + psubw m0, m5 + psubw m1, m5 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + add tmpq, 16*2 + sub hd, 4 + jg .prep_w4 + RET +.prep_w8: + movu m0, [srcq+strideq*0] + movu m1, [srcq+strideq*1] + movu m2, [srcq+strideq*2] + movu m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + 
add tmpq, 16*4 + sub hd, 4 + jg .prep_w8 + RET +.prep_w16: + movu m0, [srcq+strideq*0+16*0] + movu m1, [srcq+strideq*0+16*1] + movu m2, [srcq+strideq*1+16*0] + movu m3, [srcq+strideq*1+16*1] + lea srcq, [srcq+strideq*2] + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + sub hd, 2 + jg .prep_w16 + RET +.prep_w32: + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + add srcq, strideq + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + dec hd + jg .prep_w32 + RET +.prep_w64: + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + movu m0, [srcq+16*4] + movu m1, [srcq+16*5] + movu m2, [srcq+16*6] + movu m3, [srcq+16*7] + add srcq, strideq + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*4], m0 + mova [tmpq+16*5], m1 + mova [tmpq+16*6], m2 + mova [tmpq+16*7], m3 + add tmpq, 16*8 + dec hd + jg .prep_w64 + RET +.prep_w128: + movu m0, [srcq+16* 0] + movu m1, [srcq+16* 1] + movu m2, [srcq+16* 2] + movu m3, [srcq+16* 3] + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + movu m0, [srcq+16* 4] + movu m1, [srcq+16* 5] + movu m2, [srcq+16* 6] + movu m3, [srcq+16* 7] + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq+16*4], m0 + mova [tmpq+16*5], m1 + mova [tmpq+16*6], m2 + mova [tmpq+16*7], m3 + movu m0, [srcq+16* 8] + movu m1, [srcq+16* 9] + movu m2, [srcq+16*10] + movu m3, [srcq+16*11] + add tmpq, 16*16 + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq-16*8], m0 + mova [tmpq-16*7], m1 + mova [tmpq-16*6], m2 + mova [tmpq-16*5], m3 + movu m0, [srcq+16*12] + movu m1, [srcq+16*13] + movu m2, [srcq+16*14] + movu m3, [srcq+16*15] + add srcq, strideq + REPX {pmullw x, m4}, m0, m1, m2, m3 + REPX {psubw x, m5}, m0, m1, m2, m3 + mova [tmpq-16*4], m0 + mova [tmpq-16*3], m1 + mova [tmpq-16*2], m2 + mova [tmpq-16*1], m3 + dec hd + jg .prep_w128 + RET +.h: + movd m4, mxyd + mov mxyd, r6m ; my + mova m3, [base+pw_16] + pshufb m4, [base+pw_256] + mova m5, [base+pw_32766] + psubw m3, m4 + test dword r7m, 0x800 + jnz .h_12bpc + psllw m3, 2 + psllw m4, 2 +.h_12bpc: + test mxyd, mxyd + jnz .hv + sub wd, 8 + je .h_w8 + jg .h_w16 +.h_w4: + movq m0, [srcq+strideq*0] + movhps m0, [srcq+strideq*1] + movq m1, [srcq+strideq*0+2] + movhps m1, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + psraw m0, 2 + mova [tmpq], m0 + add tmpq, 16 + sub hd, 2 + jg .h_w4 + RET +.h_w8: + movu m0, [srcq+strideq*0] + movu m1, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + movu m1, [srcq+strideq*1] + movu m2, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m0, 2 + psraw m1, 2 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + add tmpq, 16*2 + sub hd, 2 + jg .h_w8 + RET +.h_w16: + lea srcq, [srcq+wq*2] + neg wq +.h_w16_loop0: + mov r6, wq 
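
; [editor note] The bilinear prep horizontal pass above is a 4-bit blend
; whose coefficients are pre-scaled (psllw 2 except for 12-bit input) so
; that a single psraw 2 lands the result at the common prep scale, while
; pw_32766 applies the signed-intermediate bias. A scalar model; the
; constants mirror the asm but are illustrative, and each row must have
; w+1 readable pixels.

#include <stddef.h>
#include <stdint.h>

static void prep_bilin_h_sketch(int16_t *tmp, const uint16_t *src,
                                ptrdiff_t stride, int w, int h,
                                int mx /* 0..15 */, int is_12bpc)
{
    const int scale = is_12bpc ? 1 : 4;      /* the conditional psllw 2 */
    const int c1 = mx * scale, c0 = 16 * scale - c1;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++)
            tmp[x] = (int16_t)((c0 * src[x] + c1 * src[x + 1] - 32766) >> 2);
        tmp += w;
        src += stride;
    }
}
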
+.h_w16_loop: + movu m0, [srcq+r6*2+ 0] + movu m1, [srcq+r6*2+ 2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + movu m1, [srcq+r6*2+16] + movu m2, [srcq+r6*2+18] + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m0, 2 + psraw m1, 2 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + add tmpq, 16*2 + add r6, 16 + jl .h_w16_loop + add srcq, strideq + dec hd + jg .h_w16_loop0 + RET +.v: + movd m4, mxyd + mova m3, [base+pw_16] + pshufb m4, [base+pw_256] + mova m5, [base+pw_32766] + psubw m3, m4 + test dword r7m, 0x800 + jnz .v_12bpc + psllw m3, 2 + psllw m4, 2 +.v_12bpc: + cmp wd, 8 + je .v_w8 + jg .v_w16 +.v_w4: + movq m0, [srcq+strideq*0] +.v_w4_loop: + movq m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + punpcklqdq m1, m0, m2 ; 0 1 + movq m0, [srcq+strideq*0] + punpcklqdq m2, m0 ; 1 2 + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m1, 2 + mova [tmpq], m1 + add tmpq, 16 + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + movu m0, [srcq+strideq*0] +.v_w8_loop: + movu m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + pmullw m0, m3 + pmullw m1, m4, m2 + psubw m0, m5 + paddw m1, m0 + movu m0, [srcq+strideq*0] + psraw m1, 2 + pmullw m2, m3 + mova [tmpq+16*0], m1 + pmullw m1, m4, m0 + psubw m2, m5 + paddw m1, m2 + psraw m1, 2 + mova [tmpq+16*1], m1 + add tmpq, 16*2 + sub hd, 2 + jg .v_w8_loop + RET +.v_w16: +%if WIN64 + push r7 +%endif + mov r5, srcq +%if ARCH_X86_64 + lea r6d, [wq*4-32] + mov wd, wd + lea r6d, [hq+r6*8] + mov r7, tmpq +%else + mov r6d, wd +%endif +.v_w16_loop0: + movu m0, [srcq+strideq*0] +.v_w16_loop: + movu m2, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + pmullw m0, m3 + pmullw m1, m4, m2 + psubw m0, m5 + paddw m1, m0 + movu m0, [srcq+strideq*0] + psraw m1, 2 + pmullw m2, m3 + mova [tmpq+wq*0], m1 + pmullw m1, m4, m0 + psubw m2, m5 + paddw m1, m2 + psraw m1, 2 + mova [tmpq+wq*2], m1 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jg .v_w16_loop +%if ARCH_X86_64 + add r5, 16 + add r7, 16 + movzx hd, r6b + mov srcq, r5 + mov tmpq, r7 + sub r6d, 1<<8 +%else + mov tmpq, tmpmp + add r5, 16 + mov hd, hm + add tmpq, 16 + mov srcq, r5 + mov tmpmp, tmpq + sub r6d, 8 +%endif + jg .v_w16_loop0 +%if WIN64 + pop r7 +%endif + RET +.hv: + WIN64_SPILL_XMM 7 + shl mxyd, 11 + movd m6, mxyd + pshufb m6, [base+pw_256] + cmp wd, 8 + je .hv_w8 + jg .hv_w16 +.hv_w4: + movddup m0, [srcq+strideq*0] + movddup m1, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + psraw m0, 2 +.hv_w4_loop: + movq m1, [srcq+strideq*1] + movq m2, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + movhps m1, [srcq+strideq*0] + movhps m2, [srcq+strideq*0+2] + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m1, 2 ; 1 2 + shufpd m2, m0, m1, 0x01 ; 0 1 + mova m0, m1 + psubw m1, m2 + pmulhrsw m1, m6 + paddw m1, m2 + mova [tmpq], m1 + add tmpq, 16 + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + movu m0, [srcq+strideq*0] + movu m1, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + psraw m0, 2 +.hv_w8_loop: + movu m1, [srcq+strideq*1] + movu m2, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m1, 2 + psubw m2, m1, m0 + pmulhrsw m2, m6 + paddw m2, m0 + mova [tmpq+16*0], m2 + movu m0, [srcq+strideq*0] + movu m2, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m2, m4 + psubw m0, m5 + paddw m0, m2 + psraw m0, 2 + psubw m2, m0, m1 + pmulhrsw m2, m6 + paddw m2, m1 + mova [tmpq+16*1], m2 + add tmpq, 16*2 + sub hd, 2 + jg .hv_w8_loop + RET 
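
; [editor note] The .v/.hv paths above interpolate between two filtered
; rows with one pmulhrsw: the 4-bit my fraction is pre-shifted by 11 so
; that pmulhrsw's (a*b + 0x4000) >> 15 becomes a rounded multiply by
; my/16. A scalar model of that single step (names are illustrative):

#include <stdint.h>

static inline int16_t pmulhrsw_s(int16_t a, int16_t b)
{
    return (int16_t)((a * b + 0x4000) >> 15); /* SSSE3 pmulhrsw semantics */
}

/* h0/h1: current and next horizontally filtered rows (prep scale) */
static inline int16_t bilin_v_step(int16_t h0, int16_t h1, int my /* 0..15 */)
{
    /* psubw / pmulhrsw / paddw in the asm */
    return (int16_t)(h0 + pmulhrsw_s((int16_t)(h1 - h0), (int16_t)(my << 11)));
}
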
+.hv_w16: +%if WIN64 + push r7 +%endif + mov r5, srcq +%if ARCH_X86_64 + lea r6d, [wq*4-32] + mov wd, wd + lea r6d, [hq+r6*8] + mov r7, tmpq +%else + mov r6d, wd +%endif +.hv_w16_loop0: + movu m0, [srcq+strideq*0] + movu m1, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m1, m4 + psubw m0, m5 + paddw m0, m1 + psraw m0, 2 +.hv_w16_loop: + movu m1, [srcq+strideq*1] + movu m2, [srcq+strideq*1+2] + lea srcq, [srcq+strideq*2] + pmullw m1, m3 + pmullw m2, m4 + psubw m1, m5 + paddw m1, m2 + psraw m1, 2 + psubw m2, m1, m0 + pmulhrsw m2, m6 + paddw m2, m0 + mova [tmpq+wq*0], m2 + movu m0, [srcq+strideq*0] + movu m2, [srcq+strideq*0+2] + pmullw m0, m3 + pmullw m2, m4 + psubw m0, m5 + paddw m0, m2 + psraw m0, 2 + psubw m2, m0, m1 + pmulhrsw m2, m6 + paddw m2, m1 + mova [tmpq+wq*2], m2 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jg .hv_w16_loop +%if ARCH_X86_64 + add r5, 16 + add r7, 16 + movzx hd, r6b + mov srcq, r5 + mov tmpq, r7 + sub r6d, 1<<8 +%else + mov tmpq, tmpmp + add r5, 16 + mov hd, hm + add tmpq, 16 + mov srcq, r5 + mov tmpmp, tmpq + sub r6d, 8 +%endif + jg .hv_w16_loop0 +%if WIN64 + pop r7 +%endif + RET + +; int8_t subpel_filters[5][15][8] +%assign FILTER_REGULAR (0*15 << 16) | 3*15 +%assign FILTER_SMOOTH (1*15 << 16) | 4*15 +%assign FILTER_SHARP (2*15 << 16) | 3*15 + +%macro FN 4 ; prefix, type, type_h, type_v +cglobal %1_%2_16bpc + mov t0d, FILTER_%3 +%ifidn %3, %4 + mov t1d, t0d +%else + mov t1d, FILTER_%4 +%endif +%ifnidn %2, regular ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%1_16bpc %+ SUFFIX) +%endif +%endmacro + +%if ARCH_X86_32 +DECLARE_REG_TMP 1, 2, 6 +%elif WIN64 +DECLARE_REG_TMP 4, 5, 8 +%else +DECLARE_REG_TMP 7, 8, 8 +%endif + +%define PUT_8TAP_FN FN put_8tap, +PUT_8TAP_FN sharp, SHARP, SHARP +PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH +PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP +PUT_8TAP_FN smooth, SMOOTH, SMOOTH +PUT_8TAP_FN sharp_regular, SHARP, REGULAR +PUT_8TAP_FN regular_sharp, REGULAR, SHARP +PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR +PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_FN regular, REGULAR, REGULAR + +%if ARCH_X86_32 +cglobal put_8tap_16bpc, 0, 7, 8, dst, ds, src, ss, w, h, mx, my +%define mxb r0b +%define mxd r0 +%define mxq r0 +%define myb r1b +%define myd r1 +%define myq r1 +%define m8 [esp+16*0] +%define m9 [esp+16*1] +%define m10 [esp+16*2] +%define m11 [esp+16*3] +%define m12 [esp+16*4] +%define m13 [esp+16*5] +%define m14 [esp+16*6] +%define m15 [esp+16*7] +%else +cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my +%endif +%define base t2-put_ssse3 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + LEA t2, put_ssse3 + movifnidn wd, wm + movifnidn srcq, srcmp + movifnidn ssq, ssmp + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v + tzcnt wd, wd + movzx wd, word [base+put_ssse3_table+wq*2] + movifnidn dstq, dstmp + movifnidn dsq, dsmp + add wq, t2 +%if WIN64 + pop r8 + pop r7 +%endif + jmp wq +.h: + test myd, 0xf00 + jnz .hv + mov myd, r8m + movd m5, r8m + shr myd, 11 + movddup m4, [base+put_8tap_h_rnd+myq*8] + movifnidn dsq, dsmp + pshufb m5, [base+pw_256] + cmp wd, 4 + jg .h_w8 + movzx mxd, mxb + lea srcq, [srcq-2] + movq m3, [base+subpel_filters+mxq*8] + movifnidn dstq, dstmp + punpcklbw m3, m3 + psraw m3, 8 ; sign-extend + je .h_w4 +.h_w2: + mova m2, [base+spel_h_shuf2] + pshufd m3, m3, q2121 +.h_w2_loop: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb m0, m2 + pshufb m1, m2 + pmaddwd m0, 
m3 + pmaddwd m1, m3 + phaddd m0, m1 + paddd m0, m4 + psrad m0, 6 + packssdw m0, m0 + pxor m1, m1 + pminsw m0, m5 + pmaxsw m0, m1 + movd [dstq+dsq*0], m0 + pshuflw m0, m0, q3232 + movd [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2_loop + RET +.h_w4: + WIN64_SPILL_XMM 8 + mova m6, [base+spel_h_shufA] + mova m7, [base+spel_h_shufB] + pshufd m2, m3, q1111 + pshufd m3, m3, q2222 +.h_w4_loop: + movu m1, [srcq] + add srcq, ssq + pshufb m0, m1, m6 ; 0 1 1 2 2 3 3 4 + pshufb m1, m7 ; 2 3 3 4 4 5 5 6 + pmaddwd m0, m2 + pmaddwd m1, m3 + paddd m0, m4 + paddd m0, m1 + psrad m0, 6 + packssdw m0, m0 + pxor m1, m1 + pminsw m0, m5 + pmaxsw m0, m1 + movq [dstq], m0 + add dstq, dsq + dec hd + jg .h_w4_loop + RET +.h_w8: +%if WIN64 + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 12 +%endif + shr mxd, 16 + movq m3, [base+subpel_filters+mxq*8] + movifnidn dstq, dstmp + mova m6, [base+spel_h_shufA] + mova m7, [base+spel_h_shufB] +%if UNIX64 + mov wd, wd +%endif + lea srcq, [srcq+wq*2] + punpcklbw m3, m3 + lea dstq, [dstq+wq*2] + psraw m3, 8 + neg wq +%if ARCH_X86_32 + ALLOC_STACK -16*4 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 +%else + pshufd m8, m3, q0000 + pshufd m9, m3, q1111 + pshufd m10, m3, q2222 + pshufd m11, m3, q3333 +%endif +.h_w8_loop0: + mov r6, wq +.h_w8_loop: + movu m0, [srcq+r6*2- 6] + movu m1, [srcq+r6*2+ 2] + pshufb m2, m0, m6 ; 0 1 1 2 2 3 3 4 + pshufb m0, m7 ; 2 3 3 4 4 5 5 6 + pmaddwd m2, m8 ; abcd0 + pmaddwd m0, m9 ; abcd1 + pshufb m3, m1, m6 ; 4 5 5 6 6 7 7 8 + pshufb m1, m7 ; 6 7 7 8 8 9 9 a + paddd m2, m4 + paddd m0, m2 + pmaddwd m2, m10, m3 ; abcd2 + pmaddwd m3, m8 ; efgh0 + paddd m0, m2 + pmaddwd m2, m11, m1 ; abcd3 + pmaddwd m1, m9 ; efgh1 + paddd m0, m2 + movu m2, [srcq+r6*2+10] + paddd m3, m4 + paddd m1, m3 + pshufb m3, m2, m6 ; 8 9 9 a a b b c + pshufb m2, m7 ; a b b c c d d e + pmaddwd m3, m10 ; efgh2 + pmaddwd m2, m11 ; efgh3 + paddd m1, m3 + paddd m1, m2 + psrad m0, 6 + psrad m1, 6 + packssdw m0, m1 + pxor m1, m1 + pminsw m0, m5 + pmaxsw m0, m1 + mova [dstq+r6*2], m0 + add r6, 8 + jl .h_w8_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w8_loop0 + RET +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovb myd, mxd + movq m3, [base+subpel_filters+myq*8] +%if STACK_ALIGNMENT < 16 + %xdefine rstk rsp +%else + %assign stack_offset stack_offset - stack_size_padded +%endif +%if WIN64 + WIN64_SPILL_XMM 15 +%endif + movd m7, r8m + movifnidn dstq, dstmp + movifnidn dsq, dsmp + punpcklbw m3, m3 + pshufb m7, [base+pw_256] + psraw m3, 8 ; sign-extend +%if ARCH_X86_32 + ALLOC_STACK -16*7 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 +%else + pshufd m8, m3, q0000 + pshufd m9, m3, q1111 + pshufd m10, m3, q2222 + pshufd m11, m3, q3333 +%endif + lea r6, [ssq*3] + sub srcq, r6 + cmp wd, 2 + jne .v_w4 +.v_w2: + movd m1, [srcq+ssq*0] + movd m4, [srcq+ssq*1] + movd m2, [srcq+ssq*2] + add srcq, r6 + movd m5, [srcq+ssq*0] + movd m3, [srcq+ssq*1] + movd m6, [srcq+ssq*2] + add srcq, r6 + movd m0, [srcq+ssq*0] + punpckldq m1, m4 ; 0 1 + punpckldq m4, m2 ; 1 2 + punpckldq m2, m5 ; 2 3 + punpckldq m5, m3 ; 3 4 + punpckldq m3, m6 ; 4 5 + punpckldq m6, m0 ; 5 6 + punpcklwd m1, m4 ; 01 12 + punpcklwd m2, m5 ; 23 34 + punpcklwd m3, m6 ; 45 56 + pxor m6, m6 +.v_w2_loop: + movd m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddwd m5, m8, m1 ; a0 b0 + mova 
m1, m2 + pmaddwd m2, m9 ; a1 b1 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, m10 ; a2 b2 + paddd m5, m3 + punpckldq m3, m0, m4 ; 6 7 + movd m0, [srcq+ssq*0] + punpckldq m4, m0 ; 7 8 + punpcklwd m3, m4 ; 67 78 + pmaddwd m4, m11, m3 ; a3 b3 + paddd m5, m4 + psrad m5, 5 + packssdw m5, m5 + pmaxsw m5, m6 + pavgw m5, m6 + pminsw m5, m7 + movd [dstq+dsq*0], m5 + pshuflw m5, m5, q3232 + movd [dstq+dsq*1], m5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: +%if ARCH_X86_32 + shl wd, 14 +%if STACK_ALIGNMENT < 16 + mov [esp+4*29], srcq + mov [esp+4*30], dstq +%else + mov srcmp, srcq +%endif + lea wd, [wq+hq-(1<<16)] +%else + shl wd, 6 + mov r7, srcq + mov r8, dstq + lea wd, [wq+hq-(1<<8)] +%endif +.v_w4_loop0: + movq m1, [srcq+ssq*0] + movq m2, [srcq+ssq*1] + movq m3, [srcq+ssq*2] + add srcq, r6 + movq m4, [srcq+ssq*0] + movq m5, [srcq+ssq*1] + movq m6, [srcq+ssq*2] + add srcq, r6 + movq m0, [srcq+ssq*0] + punpcklwd m1, m2 ; 01 + punpcklwd m2, m3 ; 12 + punpcklwd m3, m4 ; 23 + punpcklwd m4, m5 ; 34 + punpcklwd m5, m6 ; 45 + punpcklwd m6, m0 ; 56 +%if ARCH_X86_32 + jmp .v_w4_loop_start +.v_w4_loop: + mova m1, m12 + mova m2, m13 + mova m3, m14 +.v_w4_loop_start: + pmaddwd m1, m8 ; a0 + pmaddwd m2, m8 ; b0 + mova m12, m3 + mova m13, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddd m1, m3 + paddd m2, m4 + mova m14, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m1, m5 + paddd m2, m6 + movq m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklwd m5, m0, m6 ; 67 + movq m0, [srcq+ssq*0] + pmaddwd m3, m11, m5 ; a3 + punpcklwd m6, m0 ; 78 + paddd m1, m3 + pmaddwd m3, m11, m6 ; b3 + paddd m2, m3 + psrad m1, 5 + psrad m2, 5 + packssdw m1, m2 + pxor m2, m2 + pmaxsw m1, m2 + pavgw m1, m2 + pminsw m1, m7 + movq [dstq+dsq*0], m1 + movhps [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop +%if STACK_ALIGNMENT < 16 + mov srcq, [esp+4*29] + mov dstq, [esp+4*30] + movzx hd, ww + add srcq, 8 + add dstq, 8 + mov [esp+4*29], srcq + mov [esp+4*30], dstq +%else + mov srcq, srcmp + mov dstq, dstmp + movzx hd, ww + add srcq, 8 + add dstq, 8 + mov srcmp, srcq + mov dstmp, dstq +%endif + sub wd, 1<<16 +%else +.v_w4_loop: + pmaddwd m12, m8, m1 ; a0 + pmaddwd m13, m8, m2 ; b0 + mova m1, m3 + mova m2, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddd m12, m3 + paddd m13, m4 + mova m3, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m12, m5 + paddd m13, m6 + movq m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklwd m5, m0, m6 ; 67 + movq m0, [srcq+ssq*0] + pmaddwd m14, m11, m5 ; a3 + punpcklwd m6, m0 ; 78 + paddd m12, m14 + pmaddwd m14, m11, m6 ; b3 + paddd m13, m14 + psrad m12, 5 + psrad m13, 5 + packssdw m12, m13 + pxor m13, m13 + pmaxsw m12, m13 + pavgw m12, m13 + pminsw m12, m7 + movq [dstq+dsq*0], m12 + movhps [dstq+dsq*1], m12 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + add r7, 8 + add r8, 8 + movzx hd, wb + mov srcq, r7 + mov dstq, r8 + sub wd, 1<<8 +%endif + jg .v_w4_loop0 + RET +.hv: +%if STACK_ALIGNMENT < 16 + %xdefine rstk rsp +%else + %assign stack_offset stack_offset - stack_size_padded +%endif +%if ARCH_X86_32 + movd m4, r8m + mova m6, [base+pd_512] + pshufb m4, [base+pw_256] +%else +%if WIN64 + ALLOC_STACK 16*6, 16 +%endif + movd m15, r8m + pshufb m15, [base+pw_256] +%endif + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + je .hv_w4 + movq m0, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovb myd, mxd + movq m3, [base+subpel_filters+myq*8] +%if ARCH_X86_32 + mov dstq, dstmp + mov dsq, dsmp + 
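; hv for w2: gather pixel pairs via spel_h_shuf2, horizontal pass uses the middle 4 taps (q2121) +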
mova m5, [base+spel_h_shuf2] + ALLOC_STACK -16*8 +%else + mova m6, [base+pd_512] + mova m9, [base+spel_h_shuf2] +%endif + pshuflw m0, m0, q2121 + pxor m7, m7 + punpcklbw m7, m0 + punpcklbw m3, m3 + psraw m3, 8 ; sign-extend + test dword r8m, 0x800 + jz .hv_w2_10bpc + psraw m7, 2 + psllw m3, 2 +.hv_w2_10bpc: + lea r6, [ssq*3] + sub srcq, 2 + sub srcq, r6 +%if ARCH_X86_32 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova m9, m5 + mova m11, m0 + mova m12, m1 + mova m13, m2 + mova m14, m3 + mova m15, m4 +%else + pshufd m11, m3, q0000 + pshufd m12, m3, q1111 + pshufd m13, m3, q2222 + pshufd m14, m3, q3333 +%endif + movu m2, [srcq+ssq*0] + movu m3, [srcq+ssq*1] + movu m1, [srcq+ssq*2] + add srcq, r6 + movu m4, [srcq+ssq*0] +%if ARCH_X86_32 + REPX {pshufb x, m5}, m2, m3, m1, m4 +%else + REPX {pshufb x, m9}, m2, m3, m1, m4 +%endif + REPX {pmaddwd x, m7}, m2, m3, m1, m4 + phaddd m2, m3 ; 0 1 + phaddd m1, m4 ; 2 3 + movu m3, [srcq+ssq*1] + movu m4, [srcq+ssq*2] + add srcq, r6 + movu m0, [srcq+ssq*0] +%if ARCH_X86_32 + REPX {pshufb x, m5}, m3, m4, m0 +%else + REPX {pshufb x, m9}, m3, m4, m0 +%endif + REPX {pmaddwd x, m7}, m3, m4, m0 + phaddd m3, m4 ; 4 5 + phaddd m0, m0 ; 6 6 + REPX {paddd x, m6}, m2, m1, m3, m0 + REPX {psrad x, 10}, m2, m1, m3, m0 + packssdw m2, m1 ; 0 1 2 3 + packssdw m3, m0 ; 4 5 6 _ + palignr m4, m3, m2, 4 ; 1 2 3 4 + pshufd m5, m3, q0321 ; 5 6 _ _ + punpcklwd m1, m2, m4 ; 01 12 + punpckhwd m2, m4 ; 23 34 + punpcklwd m3, m5 ; 45 56 +.hv_w2_loop: + movu m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movu m5, [srcq+ssq*0] + pshufb m4, m9 + pshufb m5, m9 + pmaddwd m4, m7 + pmaddwd m5, m7 + phaddd m4, m5 + pmaddwd m5, m11, m1 ; a0 b0 + mova m1, m2 + pmaddwd m2, m12 ; a1 b1 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, m13 ; a2 b2 + paddd m5, m3 + paddd m4, m6 + psrad m4, 10 ; 7 8 + packssdw m0, m4 + pshufd m3, m0, q2103 + punpckhwd m3, m0 ; 67 78 + mova m0, m4 + pmaddwd m4, m14, m3 ; a3 b3 + paddd m5, m6 + paddd m5, m4 + psrad m5, 10 + packssdw m5, m5 + pxor m4, m4 + pminsw m5, m15 + pmaxsw m5, m4 + movd [dstq+dsq*0], m5 + pshuflw m5, m5, q3232 + movd [dstq+dsq*1], m5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w8: + shr mxd, 16 +.hv_w4: + movq m2, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovb myd, mxd + movq m3, [base+subpel_filters+myq*8] +%if ARCH_X86_32 +%if STACK_ALIGNMENT < 16 + %xdefine rstk rsp +%else + %assign stack_offset stack_offset - stack_size_padded +%endif + mov dstq, dstmp + mov dsq, dsmp + mova m0, [base+spel_h_shufA] + mova m1, [base+spel_h_shufB] + ALLOC_STACK -16*15 + mova m8, m0 + mova m9, m1 + mova m14, m6 +%else + mova m8, [base+spel_h_shufA] + mova m9, [base+spel_h_shufB] +%endif + pxor m0, m0 + punpcklbw m0, m2 + punpcklbw m3, m3 + psraw m3, 8 + test dword r8m, 0x800 + jz .hv_w4_10bpc + psraw m0, 2 + psllw m3, 2 +.hv_w4_10bpc: + lea r6, [ssq*3] + sub srcq, 6 + sub srcq, r6 +%if ARCH_X86_32 + %define tmp esp+16*8 + shl wd, 14 +%if STACK_ALIGNMENT < 16 + mov [esp+4*61], srcq + mov [esp+4*62], dstq +%else + mov srcmp, srcq +%endif + mova [tmp+16*5], m4 + lea wd, [wq+hq-(1<<16)] + pshufd m1, m0, q0000 + pshufd m2, m0, q1111 + pshufd m5, m0, q2222 + pshufd m0, m0, q3333 + mova m10, m1 + mova m11, m2 + mova m12, m5 + mova m13, m0 +%else +%if WIN64 + %define tmp rsp +%else + %define tmp rsp-104 ; red zone +%endif + shl wd, 6 + mov r7, srcq + mov r8, dstq + lea wd, [wq+hq-(1<<8)] + pshufd m10, m0, q0000 + pshufd m11, m0, q1111 + pshufd m12, m0, q2222 + pshufd m13, m0, 
q3333 + mova [tmp+16*5], m15 +%endif + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova [tmp+16*1], m0 + mova [tmp+16*2], m1 + mova [tmp+16*3], m2 + mova [tmp+16*4], m3 +%macro PUT_8TAP_HV_H 4-5 m14 ; dst/src+0, src+8, tmp, shift, [pd_512] + pshufb m%3, m%1, m8 ; 0 1 1 2 2 3 3 4 + pshufb m%1, m9 ; 2 3 3 4 4 5 5 6 + pmaddwd m%3, m10 + pmaddwd m%1, m11 + paddd m%3, %5 + paddd m%1, m%3 + pshufb m%3, m%2, m8 ; 4 5 5 6 6 7 7 8 + pshufb m%2, m9 ; 6 7 7 8 8 9 9 a + pmaddwd m%3, m12 + pmaddwd m%2, m13 + paddd m%1, m%3 + paddd m%1, m%2 + psrad m%1, %4 +%endmacro +.hv_w4_loop0: +%if ARCH_X86_64 + mova m14, [pd_512] +%endif + movu m4, [srcq+ssq*0+0] + movu m1, [srcq+ssq*0+8] + movu m5, [srcq+ssq*1+0] + movu m2, [srcq+ssq*1+8] + movu m6, [srcq+ssq*2+0] + movu m3, [srcq+ssq*2+8] + add srcq, r6 + PUT_8TAP_HV_H 4, 1, 0, 10 + PUT_8TAP_HV_H 5, 2, 0, 10 + PUT_8TAP_HV_H 6, 3, 0, 10 + movu m7, [srcq+ssq*0+0] + movu m2, [srcq+ssq*0+8] + movu m1, [srcq+ssq*1+0] + movu m3, [srcq+ssq*1+8] + PUT_8TAP_HV_H 7, 2, 0, 10 + PUT_8TAP_HV_H 1, 3, 0, 10 + movu m2, [srcq+ssq*2+0] + movu m3, [srcq+ssq*2+8] + add srcq, r6 + PUT_8TAP_HV_H 2, 3, 0, 10 + packssdw m4, m7 ; 0 3 + packssdw m5, m1 ; 1 4 + movu m0, [srcq+ssq*0+0] + movu m1, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 1, 3, 10 + packssdw m6, m2 ; 2 5 + packssdw m7, m0 ; 3 6 + punpcklwd m1, m4, m5 ; 01 + punpckhwd m4, m5 ; 34 + punpcklwd m2, m5, m6 ; 12 + punpckhwd m5, m6 ; 45 + punpcklwd m3, m6, m7 ; 23 + punpckhwd m6, m7 ; 56 +%if ARCH_X86_32 + jmp .hv_w4_loop_start +.hv_w4_loop: + mova m1, [tmp+16*6] + mova m2, m15 +.hv_w4_loop_start: + mova m7, [tmp+16*1] + pmaddwd m1, m7 ; a0 + pmaddwd m2, m7 ; b0 + mova m7, [tmp+16*2] + mova [tmp+16*6], m3 + pmaddwd m3, m7 ; a1 + mova m15, m4 + pmaddwd m4, m7 ; b1 + mova m7, [tmp+16*3] + paddd m1, m3 + paddd m2, m4 + mova m3, m5 + pmaddwd m5, m7 ; a2 + mova m4, m6 + pmaddwd m6, m7 ; b2 + paddd m1, m5 + paddd m2, m6 + movu m7, [srcq+ssq*1+0] + movu m5, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + PUT_8TAP_HV_H 7, 5, 6, 10 + packssdw m0, m7 ; 6 7 + mova [tmp+16*0], m0 + movu m0, [srcq+ssq*0+0] + movu m5, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 5, 6, 10 + mova m6, [tmp+16*0] + packssdw m7, m0 ; 7 8 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m5, [tmp+16*4] + paddd m1, m7 ; a3 + pmaddwd m7, m6, [tmp+16*4] + paddd m2, m7 ; b3 + psrad m1, 9 + psrad m2, 9 + packssdw m1, m2 + pxor m7, m7 + pmaxsw m1, m7 + pavgw m7, m1 + pminsw m7, [tmp+16*5] + movq [dstq+dsq*0], m7 + movhps [dstq+dsq*1], m7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop +%if STACK_ALIGNMENT < 16 + mov srcq, [esp+4*61] + mov dstq, [esp+4*62] + add srcq, 8 + add dstq, 8 + mov [esp+4*61], srcq + mov [esp+4*62], dstq +%else + mov srcq, srcmp + mov dstq, dstmp + add srcq, 8 + add dstq, 8 + mov srcmp, srcq + mov dstmp, dstq +%endif + movzx hd, ww + sub wd, 1<<16 +%else +.hv_w4_loop: + mova m15, [tmp+16*1] + pmaddwd m14, m15, m1 ; a0 + pmaddwd m15, m2 ; b0 + mova m7, [tmp+16*2] + mova m1, m3 + pmaddwd m3, m7 ; a1 + mova m2, m4 + pmaddwd m4, m7 ; b1 + mova m7, [tmp+16*3] + paddd m14, m3 + paddd m15, m4 + mova m3, m5 + pmaddwd m5, m7 ; a2 + mova m4, m6 + pmaddwd m6, m7 ; b2 + paddd m14, m5 + paddd m15, m6 + movu m7, [srcq+ssq*1+0] + movu m5, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + PUT_8TAP_HV_H 7, 5, 6, 10, [pd_512] + packssdw m0, m7 ; 6 7 + mova [tmp+16*0], m0 + movu m0, [srcq+ssq*0+0] + movu m5, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 5, 6, 10, [pd_512] + mova m6, [tmp+16*0] + packssdw m7, m0 ; 7 8 + punpcklwd m5, m6, m7 ; 
67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m5, [tmp+16*4] + paddd m14, m7 ; a3 + pmaddwd m7, m6, [tmp+16*4] + paddd m15, m7 ; b3 + psrad m14, 9 + psrad m15, 9 + packssdw m14, m15 + pxor m7, m7 + pmaxsw m14, m7 + pavgw m7, m14 + pminsw m7, [tmp+16*5] + movq [dstq+dsq*0], m7 + movhps [dstq+dsq*1], m7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + add r7, 8 + add r8, 8 + movzx hd, wb + mov srcq, r7 + mov dstq, r8 + sub wd, 1<<8 +%endif + jg .hv_w4_loop0 + RET +%undef tmp + +%if ARCH_X86_32 +DECLARE_REG_TMP 2, 1, 6, 4 +%elif WIN64 +DECLARE_REG_TMP 6, 4, 7, 4 +%else +DECLARE_REG_TMP 6, 7, 7, 8 +%endif + +%define PREP_8TAP_FN FN prep_8tap, +PREP_8TAP_FN sharp, SHARP, SHARP +PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH +PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP +PREP_8TAP_FN smooth, SMOOTH, SMOOTH +PREP_8TAP_FN sharp_regular, SHARP, REGULAR +PREP_8TAP_FN regular_sharp, REGULAR, SHARP +PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR +PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH +PREP_8TAP_FN regular, REGULAR, REGULAR + +%if ARCH_X86_32 +cglobal prep_8tap_16bpc, 0, 7, 8, tmp, src, ss, w, h, mx, my +%define mxb r0b +%define mxd r0 +%define mxq r0 +%define myb r2b +%define myd r2 +%define myq r2 +%else +cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my +%endif +%define base t2-prep_ssse3 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + LEA t2, prep_ssse3 + movifnidn wd, wm + movifnidn srcq, srcmp + test mxd, 0xf00 + jnz .h + movifnidn hd, hm + test myd, 0xf00 + jnz .v + tzcnt wd, wd + mov myd, r7m ; bitdepth_max + movzx wd, word [base+prep_ssse3_table+wq*2] + mova m5, [base+pw_8192] + shr myd, 11 + add wq, t2 + movddup m4, [base+prep_mul+myq*8] + movifnidn ssq, ssmp + movifnidn tmpq, tmpmp + lea r6, [ssq*3] +%if WIN64 + pop r7 +%endif + jmp wq +.h: + test myd, 0xf00 + jnz .hv + movifnidn ssq, r2mp + movifnidn hd, r4m + movddup m5, [base+prep_8tap_1d_rnd] + cmp wd, 4 + jne .h_w8 + movzx mxd, mxb + movq m0, [base+subpel_filters+mxq*8] + mova m3, [base+spel_h_shufA] + mova m4, [base+spel_h_shufB] + movifnidn tmpq, tmpmp + sub srcq, 2 + WIN64_SPILL_XMM 8 + punpcklbw m0, m0 + psraw m0, 8 + test dword r7m, 0x800 + jnz .h_w4_12bpc + psllw m0, 2 +.h_w4_12bpc: + pshufd m6, m0, q1111 + pshufd m7, m0, q2222 +.h_w4_loop: + movu m1, [srcq+ssq*0] + movu m2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4 + pshufb m1, m4 ; 2 3 3 4 4 5 5 6 + pmaddwd m0, m6 + pmaddwd m1, m7 + paddd m0, m5 + paddd m0, m1 + pshufb m1, m2, m3 + pshufb m2, m4 + pmaddwd m1, m6 + pmaddwd m2, m7 + paddd m1, m5 + paddd m1, m2 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + mova [tmpq], m0 + add tmpq, 16 + sub hd, 2 + jg .h_w4_loop + RET +.h_w8: + WIN64_SPILL_XMM 11 + shr mxd, 16 + movq m2, [base+subpel_filters+mxq*8] + mova m4, [base+spel_h_shufA] + mova m6, [base+spel_h_shufB] + movifnidn tmpq, r0mp + add wd, wd + punpcklbw m2, m2 + add srcq, wq + psraw m2, 8 + add tmpq, wq + neg wq + test dword r7m, 0x800 + jnz .h_w8_12bpc + psllw m2, 2 +.h_w8_12bpc: + pshufd m7, m2, q0000 +%if ARCH_X86_32 + ALLOC_STACK -16*3 + pshufd m0, m2, q1111 + pshufd m1, m2, q2222 + pshufd m2, m2, q3333 + mova m8, m0 + mova m9, m1 + mova m10, m2 +%else + pshufd m8, m2, q1111 + pshufd m9, m2, q2222 + pshufd m10, m2, q3333 +%endif +.h_w8_loop0: + mov r6, wq +.h_w8_loop: + movu m0, [srcq+r6- 6] + movu m1, [srcq+r6+ 2] + pshufb m2, m0, m4 ; 0 1 1 2 2 3 3 4 + pshufb m0, m6 ; 2 3 3 4 4 5 5 6 + pmaddwd m2, m7 ; abcd0 + pmaddwd m0, m8 ; abcd1 + pshufb m3, m1, 
m4 ; 4 5 5 6 6 7 7 8 + pshufb m1, m6 ; 6 7 7 8 8 9 9 a + paddd m2, m5 + paddd m0, m2 + pmaddwd m2, m9, m3 ; abcd2 + pmaddwd m3, m7 ; efgh0 + paddd m0, m2 + pmaddwd m2, m10, m1 ; abcd3 + pmaddwd m1, m8 ; efgh1 + paddd m0, m2 + movu m2, [srcq+r6+10] + paddd m3, m5 + paddd m1, m3 + pshufb m3, m2, m4 ; 8 9 9 a a b b c + pshufb m2, m6 ; a b b c c d d e + pmaddwd m3, m9 ; efgh2 + pmaddwd m2, m10 ; efgh3 + paddd m1, m3 + paddd m1, m2 + psrad m0, 4 + psrad m1, 4 + packssdw m0, m1 + mova [tmpq+r6], m0 + add r6, 16 + jl .h_w8_loop + add srcq, ssq + sub tmpq, wq + dec hd + jg .h_w8_loop0 + RET +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + movq m3, [base+subpel_filters+myq*8] +%if STACK_ALIGNMENT < 16 + %xdefine rstk rsp +%else + %assign stack_offset stack_offset - stack_size_padded +%endif + WIN64_SPILL_XMM 15 + movddup m7, [base+prep_8tap_1d_rnd] + movifnidn ssq, r2mp + movifnidn tmpq, r0mp + punpcklbw m3, m3 + psraw m3, 8 ; sign-extend + test dword r7m, 0x800 + jnz .v_12bpc + psllw m3, 2 +.v_12bpc: +%if ARCH_X86_32 + ALLOC_STACK -16*7 + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 +%else + pshufd m8, m3, q0000 + pshufd m9, m3, q1111 + pshufd m10, m3, q2222 + pshufd m11, m3, q3333 +%endif + lea r6, [ssq*3] + sub srcq, r6 + mov r6d, wd + shl wd, 6 + mov r5, srcq +%if ARCH_X86_64 + mov r7, tmpq +%elif STACK_ALIGNMENT < 16 + mov [esp+4*29], tmpq +%endif + lea wd, [wq+hq-(1<<8)] +.v_loop0: + movq m1, [srcq+ssq*0] + movq m2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movq m3, [srcq+ssq*0] + movq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movq m5, [srcq+ssq*0] + movq m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movq m0, [srcq+ssq*0] + punpcklwd m1, m2 ; 01 + punpcklwd m2, m3 ; 12 + punpcklwd m3, m4 ; 23 + punpcklwd m4, m5 ; 34 + punpcklwd m5, m6 ; 45 + punpcklwd m6, m0 ; 56 +%if ARCH_X86_32 + jmp .v_loop_start +.v_loop: + mova m1, m12 + mova m2, m13 + mova m3, m14 +.v_loop_start: + pmaddwd m1, m8 ; a0 + pmaddwd m2, m8 ; b0 + mova m12, m3 + mova m13, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddd m1, m3 + paddd m2, m4 + mova m14, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m1, m5 + paddd m2, m6 + movq m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklwd m5, m0, m6 ; 67 + movq m0, [srcq+ssq*0] + pmaddwd m3, m11, m5 ; a3 + punpcklwd m6, m0 ; 78 + paddd m1, m7 + paddd m1, m3 + pmaddwd m3, m11, m6 ; b3 + paddd m2, m7 + paddd m2, m3 + psrad m1, 4 + psrad m2, 4 + packssdw m1, m2 + movq [tmpq+r6*0], m1 + movhps [tmpq+r6*2], m1 + lea tmpq, [tmpq+r6*4] + sub hd, 2 + jg .v_loop +%if STACK_ALIGNMENT < 16 + mov tmpq, [esp+4*29] + add r5, 8 + add tmpq, 8 + mov srcq, r5 + mov [esp+4*29], tmpq +%else + mov tmpq, tmpmp + add r5, 8 + add tmpq, 8 + mov srcq, r5 + mov tmpmp, tmpq +%endif +%else +.v_loop: + pmaddwd m12, m8, m1 ; a0 + pmaddwd m13, m8, m2 ; b0 + mova m1, m3 + mova m2, m4 + pmaddwd m3, m9 ; a1 + pmaddwd m4, m9 ; b1 + paddd m12, m3 + paddd m13, m4 + mova m3, m5 + mova m4, m6 + pmaddwd m5, m10 ; a2 + pmaddwd m6, m10 ; b2 + paddd m12, m5 + paddd m13, m6 + movq m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklwd m5, m0, m6 ; 67 + movq m0, [srcq+ssq*0] + pmaddwd m14, m11, m5 ; a3 + punpcklwd m6, m0 ; 78 + paddd m12, m7 + paddd m12, m14 + pmaddwd m14, m11, m6 ; b3 + paddd m13, m7 + paddd m13, m14 + psrad m12, 4 + psrad m13, 4 + packssdw m12, m13 + movq [tmpq+r6*0], m12 + movhps [tmpq+r6*2], m12 + lea tmpq, [tmpq+r6*4] + sub hd, 2 + jg .v_loop + add r5, 8 + add 
r7, 8 + mov srcq, r5 + mov tmpq, r7 +%endif + movzx hd, wb + sub wd, 1<<8 + jg .v_loop0 + RET +.hv: +%if STACK_ALIGNMENT < 16 + %xdefine rstk rsp +%else + %assign stack_offset stack_offset - stack_size_padded +%endif + movzx t3d, mxb + shr mxd, 16 + cmp wd, 4 + cmove mxd, t3d + movifnidn hd, r4m + movq m2, [base+subpel_filters+mxq*8] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + movq m3, [base+subpel_filters+myq*8] +%if ARCH_X86_32 + mov ssq, r2mp + mov tmpq, r0mp + mova m0, [base+spel_h_shufA] + mova m1, [base+spel_h_shufB] + mova m4, [base+prep_8tap_2d_rnd] + ALLOC_STACK -16*14 + mova m8, m0 + mova m9, m1 + mova m14, m4 +%else +%if WIN64 + ALLOC_STACK 16*6, 16 +%endif + mova m8, [base+spel_h_shufA] + mova m9, [base+spel_h_shufB] +%endif + pxor m0, m0 + punpcklbw m0, m2 + punpcklbw m3, m3 + psraw m0, 4 + psraw m3, 8 + test dword r7m, 0x800 + jz .hv_10bpc + psraw m0, 2 +.hv_10bpc: + lea r6, [ssq*3] + sub srcq, 6 + sub srcq, r6 + mov r6d, wd + shl wd, 6 + mov r5, srcq +%if ARCH_X86_32 + %define tmp esp+16*8 +%if STACK_ALIGNMENT < 16 + mov [esp+4*61], tmpq +%endif + pshufd m1, m0, q0000 + pshufd m2, m0, q1111 + pshufd m5, m0, q2222 + pshufd m0, m0, q3333 + mova m10, m1 + mova m11, m2 + mova m12, m5 + mova m13, m0 +%else +%if WIN64 + %define tmp rsp +%else + %define tmp rsp-88 ; red zone +%endif + mov r7, tmpq + pshufd m10, m0, q0000 + pshufd m11, m0, q1111 + pshufd m12, m0, q2222 + pshufd m13, m0, q3333 +%endif + lea wd, [wq+hq-(1<<8)] + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova [tmp+16*1], m0 + mova [tmp+16*2], m1 + mova [tmp+16*3], m2 + mova [tmp+16*4], m3 +.hv_loop0: +%if ARCH_X86_64 + mova m14, [prep_8tap_2d_rnd] +%endif + movu m4, [srcq+ssq*0+0] + movu m1, [srcq+ssq*0+8] + movu m5, [srcq+ssq*1+0] + movu m2, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + movu m6, [srcq+ssq*0+0] + movu m3, [srcq+ssq*0+8] + PUT_8TAP_HV_H 4, 1, 0, 6 + PUT_8TAP_HV_H 5, 2, 0, 6 + PUT_8TAP_HV_H 6, 3, 0, 6 + movu m7, [srcq+ssq*1+0] + movu m2, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + movu m1, [srcq+ssq*0+0] + movu m3, [srcq+ssq*0+8] + PUT_8TAP_HV_H 7, 2, 0, 6 + PUT_8TAP_HV_H 1, 3, 0, 6 + movu m2, [srcq+ssq*1+0] + movu m3, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + PUT_8TAP_HV_H 2, 3, 0, 6 + packssdw m4, m7 ; 0 3 + packssdw m5, m1 ; 1 4 + movu m0, [srcq+ssq*0+0] + movu m1, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 1, 3, 6 + packssdw m6, m2 ; 2 5 + packssdw m7, m0 ; 3 6 + punpcklwd m1, m4, m5 ; 01 + punpckhwd m4, m5 ; 34 + punpcklwd m2, m5, m6 ; 12 + punpckhwd m5, m6 ; 45 + punpcklwd m3, m6, m7 ; 23 + punpckhwd m6, m7 ; 56 +%if ARCH_X86_32 + jmp .hv_loop_start +.hv_loop: + mova m1, [tmp+16*5] + mova m2, m15 +.hv_loop_start: + mova m7, [tmp+16*1] + pmaddwd m1, m7 ; a0 + pmaddwd m2, m7 ; b0 + mova m7, [tmp+16*2] + mova [tmp+16*5], m3 + pmaddwd m3, m7 ; a1 + mova m15, m4 + pmaddwd m4, m7 ; b1 + mova m7, [tmp+16*3] + paddd m1, m14 + paddd m2, m14 + paddd m1, m3 + paddd m2, m4 + mova m3, m5 + pmaddwd m5, m7 ; a2 + mova m4, m6 + pmaddwd m6, m7 ; b2 + paddd m1, m5 + paddd m2, m6 + movu m7, [srcq+ssq*1+0] + movu m5, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + PUT_8TAP_HV_H 7, 5, 6, 6 + packssdw m0, m7 ; 6 7 + mova [tmp+16*0], m0 + movu m0, [srcq+ssq*0+0] + movu m5, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 5, 6, 6 + mova m6, [tmp+16*0] + packssdw m7, m0 ; 7 8 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m5, [tmp+16*4] + paddd m1, m7 ; a3 + pmaddwd m7, m6, [tmp+16*4] + paddd m2, m7 ; b3 + psrad m1, 6 + psrad m2, 6 + packssdw m1, m2 + movq [tmpq+r6*0], m1 
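+ ; low qword is row 0, high qword is row 1; prep tmp rows are r6 words (r6*2 bytes) apart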
+ movhps [tmpq+r6*2], m1 + lea tmpq, [tmpq+r6*4] + sub hd, 2 + jg .hv_loop +%if STACK_ALIGNMENT < 16 + mov tmpq, [esp+4*61] + add r5, 8 + add tmpq, 8 + mov srcq, r5 + mov [esp+4*61], tmpq +%else + mov tmpq, tmpmp + add r5, 8 + add tmpq, 8 + mov srcq, r5 + mov tmpmp, tmpq +%endif +%else +.hv_loop: + mova m15, [tmp+16*1] + mova m7, [prep_8tap_2d_rnd] + pmaddwd m14, m15, m1 ; a0 + pmaddwd m15, m2 ; b0 + paddd m14, m7 + paddd m15, m7 + mova m7, [tmp+16*2] + mova m1, m3 + pmaddwd m3, m7 ; a1 + mova m2, m4 + pmaddwd m4, m7 ; b1 + mova m7, [tmp+16*3] + paddd m14, m3 + paddd m15, m4 + mova m3, m5 + pmaddwd m5, m7 ; a2 + mova m4, m6 + pmaddwd m6, m7 ; b2 + paddd m14, m5 + paddd m15, m6 + movu m7, [srcq+ssq*1+0] + movu m5, [srcq+ssq*1+8] + lea srcq, [srcq+ssq*2] + PUT_8TAP_HV_H 7, 5, 6, 6, [prep_8tap_2d_rnd] + packssdw m0, m7 ; 6 7 + mova [tmp+16*0], m0 + movu m0, [srcq+ssq*0+0] + movu m5, [srcq+ssq*0+8] + PUT_8TAP_HV_H 0, 5, 6, 6, [prep_8tap_2d_rnd] + mova m6, [tmp+16*0] + packssdw m7, m0 ; 7 8 + punpcklwd m5, m6, m7 ; 67 + punpckhwd m6, m7 ; 78 + pmaddwd m7, m5, [tmp+16*4] + paddd m14, m7 ; a3 + pmaddwd m7, m6, [tmp+16*4] + paddd m15, m7 ; b3 + psrad m14, 6 + psrad m15, 6 + packssdw m14, m15 + movq [tmpq+r6*0], m14 + movhps [tmpq+r6*2], m14 + lea tmpq, [tmpq+r6*4] + sub hd, 2 + jg .hv_loop + add r5, 8 + add r7, 8 + mov srcq, r5 + mov tmpq, r7 +%endif + movzx hd, wb + sub wd, 1<<8 + jg .hv_loop0 + RET +%undef tmp + +%macro movifprep 2 + %if isprep + mov %1, %2 + %endif +%endmacro + +%macro SAVE_REG 1 + %xdefine r%1_save r%1 + %xdefine r%1q_save r%1q + %xdefine r%1d_save r%1d + %if ARCH_X86_32 + %define r%1m_save [rstk+stack_offset+(%1+1)*4] + %endif +%endmacro + +%macro LOAD_REG 1 + %xdefine r%1 r%1_save + %xdefine r%1q r%1q_save + %xdefine r%1d r%1d_save + %if ARCH_X86_32 + %define r%1m r%1m_save + %endif + %undef r%1d_save + %undef r%1q_save + %undef r%1_save +%endmacro + +%macro REMAP_REG 2-3 + %xdefine r%1 r%2 + %xdefine r%1q r%2q + %xdefine r%1d r%2d + %if ARCH_X86_32 + %if %3 == 0 + %xdefine r%1m r%2m + %else + %define r%1m [rstk+stack_offset+(%1+1)*4] + %endif + %endif +%endmacro + +%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 + %if isprep + %if ARCH_X86_64 + SAVE_REG 14 + %assign %%i 14 + %rep 14 + %assign %%j %%i-1 + REMAP_REG %%i, %%j + %assign %%i %%i-1 + %endrep + %else + SAVE_REG 5 + %assign %%i 5 + %rep 5 + %assign %%j %%i-1 + REMAP_REG %%i, %%j, 0 + %assign %%i %%i-1 + %endrep + %endif + %endif +%endmacro + +%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 + %if isprep + %assign %%i 1 + %if ARCH_X86_64 + %rep 13 + %assign %%j %%i+1 + REMAP_REG %%i, %%j + %assign %%i %%i+1 + %endrep + LOAD_REG 14 + %else + %rep 4 + %assign %%j %%i+1 + REMAP_REG %%i, %%j, 1 + %assign %%i %%i+1 + %endrep + LOAD_REG 5 + %endif + %endif +%endmacro + +%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged + MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT + RET + %if %1 + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %endif +%endmacro + +%if ARCH_X86_32 + %macro MC_4TAP_SCALED_H 1 ; dst_mem + movu m7, [srcq+ssq*0] + movu m2, [srcq+ssq*1] + movu m5, [r4 +ssq*0] + movu m6, [r4 +ssq*1] + lea srcq, [srcq+ssq*2] + lea r4, [r4 +ssq*2] + REPX {pshufb x, m12}, m7, m2 + REPX {pmaddwd x, m13}, m7, m2 + REPX {pshufb x, m14}, m5, m6 + REPX {pmaddwd x, m15}, m5, m6 + phaddd m7, m5 + phaddd m2, m6 + mova m5, [esp+0x00] + movd m6, [esp+0x10] + paddd m7, m5 + paddd m2, m5 + psrad m7, m6 + psrad m2, m6 + packssdw m7, m2 + mova [stk+%1], m7 + %endmacro +%endif + +%if ARCH_X86_64 + %macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6] + movu m%1, [srcq+ 
r4*2] + movu m%2, [srcq+ r6*2] + movu m%3, [srcq+ r7*2] + movu m%4, [srcq+ r9*2] + movu m%5, [srcq+r10*2] + movu m%6, [srcq+r11*2] + movu m%7, [srcq+r13*2] + movu m%8, [srcq+ rX*2] + add srcq, ssq + pmaddwd m%1, [stk+0x10] + pmaddwd m%2, [stk+0x20] + pmaddwd m%3, [stk+0x30] + pmaddwd m%4, [stk+0x40] + pmaddwd m%5, [stk+0x50] + pmaddwd m%6, [stk+0x60] + pmaddwd m%7, [stk+0x70] + pmaddwd m%8, [stk+0x80] + phaddd m%1, m%2 + phaddd m%3, m%4 + phaddd m%5, m%6 + phaddd m%7, m%8 + phaddd m%1, m%3 + phaddd m%5, m%7 + paddd m%1, hround + paddd m%5, hround + psrad m%1, m12 + psrad m%5, m12 + packssdw m%1, m%5 + %endmacro +%else + %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem, load_fh_offsets + %if %3 == 1 + mov r0, [stk+ 0] + mov rX, [stk+ 4] + mov r4, [stk+ 8] + mov r5, [stk+12] + %endif + movu m0, [srcq+r0*2] + movu m1, [srcq+rX*2] + movu m2, [srcq+r4*2] + movu m3, [srcq+r5*2] + mov r0, [stk+16] + mov rX, [stk+20] + mov r4, [stk+24] + mov r5, [stk+28] + pmaddwd m0, [stk+%1+0x00] + pmaddwd m1, [stk+%1+0x10] + pmaddwd m2, [stk+%1+0x20] + pmaddwd m3, [stk+%1+0x30] + phaddd m0, m1 + phaddd m2, m3 + movu m4, [srcq+r0*2] + movu m5, [srcq+rX*2] + movu m6, [srcq+r4*2] + movu m7, [srcq+r5*2] + add srcq, ssq + pmaddwd m4, [stk+%1+0xa0] + pmaddwd m5, [stk+%1+0xb0] + pmaddwd m6, [stk+%1+0xc0] + pmaddwd m7, [stk+%1+0xd0] + phaddd m4, m5 + phaddd m6, m7 + phaddd m0, m2 + phaddd m4, m6 + paddd m0, hround + paddd m4, hround + psrad m0, m12 + psrad m4, m12 + packssdw m0, m4 + %if %2 != 0 + mova [stk+%2], m0 + %endif + %endmacro +%endif + +%macro MC_8TAP_SCALED 1 +%ifidn %1, put + %assign isput 1 + %assign isprep 0 + %if ARCH_X86_64 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal put_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax + %else +cglobal put_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax + %endif + %else ; ARCH_X86_32 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal put_8tap_scaled_16bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax + %else +cglobal put_8tap_scaled_16bpc, 0, 7, 8, -0x200-0x30, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax + %endif + %endif + %xdefine base_reg r12 +%else ; prep + %assign isput 0 + %assign isprep 1 + %if ARCH_X86_64 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal prep_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax + %xdefine tmp_stridem r14q + %else +cglobal prep_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax + %define tmp_stridem qword [stk+0x138] + %endif + %xdefine base_reg r11 + %else ; ARCH_X86_32 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal prep_8tap_scaled_16bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax + %else +cglobal prep_8tap_scaled_16bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax + %endif + %define tmp_stridem dword [stk+0x138] + %endif +%endif +%if ARCH_X86_32 + mov [esp+0x1f0], t0d + mov [esp+0x1f4], t1d + %if isput && required_stack_alignment > STACK_ALIGNMENT + mov dstd, dstm + mov dsd, dsm + mov srcd, srcm + mov ssd, ssm + mov hd, hm + mov r4, mxm + %define r0m [esp+0x200] + %define dsm [esp+0x204] + %define dsmp dsm + %define r1m dsm + %define r2m [esp+0x208] + %define ssm [esp+0x20c] + %define r3m ssm + %define hm [esp+0x210] + %define mxm [esp+0x214] + mov r0m, dstd + mov dsm, dsd + mov r2m, srcd + mov ssm, ssd + mov hm, hd + mov r0, mym + mov r1, dxm + mov r2, dym + %define mym [esp+0x218] + %define dxm [esp+0x21c] 
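+ ; stack realignment hides the incoming args, so they are copied to fixed esp-relative slots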
+ %define dym [esp+0x220] + mov mxm, r4 + mov mym, r0 + mov dxm, r1 + mov dym, r2 + tzcnt wd, wm + %endif + %if isput + mov r3, pxmaxm + %define pxmaxm r3 + %else + mov r2, pxmaxm + %endif + %if isprep && required_stack_alignment > STACK_ALIGNMENT + %xdefine base_reg r5 + %else + %xdefine base_reg r6 + %endif +%endif + LEA base_reg, %1_8tap_scaled_16bpc_ssse3 +%xdefine base base_reg-%1_8tap_scaled_16bpc_ssse3 +%if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT + tzcnt wd, wm +%endif +%if ARCH_X86_64 + %if isput + mov r7d, pxmaxm + %endif +%else + %define m8 m0 + %define m9 m1 + %define m14 m4 + %define m15 m3 +%endif + movd m8, dxm + movd m14, mxm +%if isput + movd m15, pxmaxm +%endif + pshufd m8, m8, q0000 + pshufd m14, m14, q0000 +%if isput + pshuflw m15, m15, q0000 + punpcklqdq m15, m15 +%endif +%if isprep + %if UNIX64 + mov r5d, t0d + DECLARE_REG_TMP 5, 7 + %endif + %if ARCH_X86_64 + mov r6d, pxmaxm + %endif +%endif +%if ARCH_X86_64 + mov dyd, dym +%endif +%if isput + %if WIN64 + mov r8d, hm + DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 + %define hm r5m + %define dxm r8m + %elif ARCH_X86_64 + DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 + %define hm r6m + %else + %endif + %if ARCH_X86_64 + %if required_stack_alignment > STACK_ALIGNMENT + %define dsm [rsp+0x138] + %define rX r1 + %define rXd r1d + %else + %define dsm dsq + %define rX r14 + %define rXd r14d + %endif + %else + %define rX r1 + %endif +%else ; prep + %if WIN64 + mov r7d, hm + DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 + %define hm r4m + %define dxm r7m + %elif ARCH_X86_64 + DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 + %xdefine hm r7m + %endif + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %if ARCH_X86_64 + %define rX r14 + %define rXd r14d + %else + %define rX r3 + %endif +%endif +%if ARCH_X86_64 + shr r7d, 11 + mova m10, [base+pd_0x3ff] + movddup m11, [base+s_8tap_h_rnd+r7*8] + movd m12, [base+s_8tap_h_sh+r7*4] + %if isput + movddup m13, [base+put_s_8tap_v_rnd+r7*8] + movd m7, [base+put_s_8tap_v_sh+r7*4] + %define pxmaxm [rsp] + mova pxmaxm, m15 + punpcklqdq m12, m7 + %endif + lea ss3q, [ssq*3] + movzx r7d, t1b + shr t1d, 16 + cmp hd, 6 + cmovs t1d, r7d + sub srcq, ss3q +%else + %define m10 [base+pd_0x3ff] + %define m11 [esp+0x00] + %define m12 [esp+0x10] + shr r3, 11 + movddup m1, [base+s_8tap_h_rnd+r3*8] + movd m2, [base+s_8tap_h_sh+r3*4] + %if isput + %define m13 [esp+0x20] + %define pxmaxm [esp+0x30] + %define stk esp+0x40 + movddup m5, [base+put_s_8tap_v_rnd+r3*8] + movd m6, [base+put_s_8tap_v_sh+r3*4] + mova pxmaxm, m15 + punpcklqdq m2, m6 + mova m13, m5 + %else + %define m13 [base+pd_m524256] + %endif + mov ssd, ssm + mova m11, m1 + mova m12, m2 + MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT + mov r1, [esp+0x1f4] + lea r0, [ssd*3] + movzx r2, r1b + shr r1, 16 + cmp dword hm, 6 + cmovs r1, r2 + mov [esp+0x1f4], r1 + %if isprep + mov r1, r1m + %endif + mov r2, r2m + sub srcq, r0 + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %define ss3q r0 + %define myd r4 + %define dyd dword dym + %define hd dword hm +%endif + cmp dyd, 1024 + je .dy1 + cmp dyd, 2048 + je .dy2 + movzx wd, word [base+%1_8tap_scaled_ssse3_table+wq*2] + add wq, base_reg + jmp wq +%if isput +.w2: + %if ARCH_X86_64 + mov myd, mym + movzx t0d, t0b + sub srcq, 2 + movd m15, t0d + %else + movzx r4, byte [esp+0x1f0] + sub srcq, 2 + movd m15, r4 + %endif + pxor m9, m9 + punpckldq m9, m8 + paddd m14, m9 ; mx+dx*[0-1] + %if ARCH_X86_64 + mova m9, [base+pd_0x4000] + %endif + pshufd m15, m15, q0000 + pand m8, m14, m10 + psrld m8, 
6 + paddd m15, m8 + movd r4d, m15 + pshufd m15, m15, q0321 + %if ARCH_X86_64 + movd r6d, m15 + %else + movd r3d, m15 + %endif + mova m5, [base+bdct_lb_q] + mova m6, [base+spel_s_shuf2] + movd m15, [base+subpel_filters+r4*8+2] + %if ARCH_X86_64 + movd m7, [base+subpel_filters+r6*8+2] + %else + movd m7, [base+subpel_filters+r3*8+2] + %endif + pxor m2, m2 + pcmpeqd m8, m2 + psrld m14, 10 + paddd m14, m14 + %if ARCH_X86_32 + mov r3, r3m + pshufb m14, m5 + paddb m14, m6 + mova [stk], m14 + SWAP m5, m0 + SWAP m6, m3 + %define m15 m6 + %endif + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + movu m2, [srcq+ssq*2] + movu m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + punpckldq m15, m7 + %if ARCH_X86_64 + pshufb m14, m5 + paddb m14, m6 + pand m9, m8 + pandn m8, m15 + SWAP m15, m8 + por m15, m9 + movu m4, [srcq+ssq*0] + movu m5, [srcq+ssq*1] + movu m6, [srcq+ssq*2] + movu m7, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + %else + pand m7, m5, [base+pd_0x4000] + pandn m5, m15 + por m5, m7 + %define m15 m5 + %endif + punpcklbw m15, m15 + psraw m15, 8 + REPX {pshufb x, m14}, m0, m1, m2, m3 + REPX {pmaddwd x, m15}, m0, m1, m2, m3 + %if ARCH_X86_64 + REPX {pshufb x, m14}, m4, m5, m6, m7 + REPX {pmaddwd x, m15}, m4, m5, m6, m7 + phaddd m0, m1 + phaddd m2, m3 + phaddd m4, m5 + phaddd m6, m7 + REPX {paddd x, m11}, m0, m2, m4, m6 + REPX {psrad x, m12}, m0, m2, m4, m6 + packssdw m0, m2 ; 0 1 2 3 + packssdw m4, m6 ; 4 5 6 7 + SWAP m1, m4 + %else + mova [stk+0x10], m15 + phaddd m0, m1 + phaddd m2, m3 + movu m1, [srcq+ssq*0] + movu m7, [srcq+ssq*1] + movu m6, [srcq+ssq*2] + movu m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + REPX {pshufb x, m14}, m1, m7, m6, m3 + REPX {pmaddwd x, m15}, m1, m7, m6, m3 + phaddd m1, m7 + phaddd m6, m3 + REPX {paddd x, m11}, m0, m2, m1, m6 + REPX {psrad x, m12}, m0, m2, m1, m6 + packssdw m0, m2 + packssdw m1, m6 + %define m14 [stk+0x00] + %define m15 [stk+0x10] + %endif + palignr m2, m1, m0, 4 ; 1 2 3 4 + punpcklwd m3, m0, m2 ; 01 12 + punpckhwd m0, m2 ; 23 34 + pshufd m5, m1, q0321 ; 5 6 7 _ + punpcklwd m2, m1, m5 ; 45 56 + punpckhwd m4, m1, m5 ; 67 __ + %if ARCH_X86_32 + mov myd, mym + mov r0, r0m + mova [stk+0x20], m3 + mova [stk+0x30], m0 + mova [stk+0x40], m2 + mova [stk+0x50], m4 + %endif +.w2_loop: + and myd, 0x3ff + %if ARCH_X86_64 + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq m10, r6q + punpcklbw m10, m10 + psraw m10, 8 + pshufd m7, m10, q0000 + pshufd m8, m10, q1111 + pmaddwd m5, m3, m7 + pmaddwd m6, m0, m8 + pshufd m9, m10, q2222 + pshufd m10, m10, q3333 + pmaddwd m7, m2, m9 + pmaddwd m8, m4, m10 + paddd m5, m6 + paddd m7, m8 + %else + mov r1, [esp+0x1f4] + xor r3, r3 + mov r5, myd + shr r5, 6 + lea r1, [r1+r5] + mov r5, 64 << 24 + cmovnz r3, [base+subpel_filters+r1*8+4] + cmovnz r5, [base+subpel_filters+r1*8+0] + movd m6, r3 + movd m7, r5 + punpckldq m7, m6 + punpcklbw m7, m7 + psraw m7, 8 + pshufd m5, m7, q0000 + pshufd m6, m7, q1111 + pmaddwd m3, m5 + pmaddwd m0, m6 + pshufd m5, m7, q2222 + pshufd m7, m7, q3333 + pmaddwd m2, m5 + pmaddwd m4, m7 + paddd m3, m0 + paddd m2, m4 + SWAP m5, m3 + SWAP m7, m2 + %define m8 m3 + %endif + paddd m5, m13 + pshufd m6, m12, q1032 + pxor m8, m8 + paddd m5, m7 + psrad m5, m6 + packssdw m5, m5 + pmaxsw m5, m8 + pminsw m5, pxmaxm + movd [dstq], m5 + add dstq, dsmp + dec hd + jz .ret + %if ARCH_X86_64 + add myd, dyd + %else + add myd, dym + %endif + test myd, ~0x3ff + %if ARCH_X86_32 + SWAP m3, m5 + SWAP m2, m7 + mova m3, [stk+0x20] + mova m0, [stk+0x30] + mova m2, [stk+0x40] + mova 
m4, [stk+0x50] + %endif + jz .w2_loop + %if ARCH_X86_32 + mov r3, r3m + %endif + movu m5, [srcq] + test myd, 0x400 + jz .w2_skip_line + add srcq, ssq + shufps m3, m0, q1032 ; 01 12 + shufps m0, m2, q1032 ; 23 34 + shufps m2, m4, q1032 ; 45 56 + pshufb m5, m14 + pmaddwd m5, m15 + phaddd m5, m5 + paddd m5, m11 + psrad m5, m12 + packssdw m5, m5 + palignr m4, m5, m1, 12 + punpcklqdq m1, m4, m4 ; 6 7 6 7 + punpcklwd m4, m1, m5 ; 67 __ + %if ARCH_X86_32 + mova [stk+0x20], m3 + mova [stk+0x30], m0 + mova [stk+0x40], m2 + mova [stk+0x50], m4 + %endif + jmp .w2_loop +.w2_skip_line: + movu m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova m3, m0 ; 01 12 + mova m0, m2 ; 23 34 + pshufb m5, m14 + pshufb m6, m14 + pmaddwd m5, m15 + pmaddwd m6, m15 + phaddd m5, m6 + paddd m5, m11 + psrad m5, m12 + packssdw m5, m5 ; 6 7 6 7 + punpckhqdq m1, m5 ; 4 5 6 7 + pshufd m5, m1, q0321 ; 5 6 7 _ + punpcklwd m2, m1, m5 ; 45 56 + punpckhwd m4, m1, m5 ; 67 __ + %if ARCH_X86_32 + mova [stk+0x20], m3 + mova [stk+0x30], m0 + mova [stk+0x40], m2 + mova [stk+0x50], m4 + %endif + jmp .w2_loop +%endif +INIT_XMM ssse3 +.w4: +%if ARCH_X86_64 + mov myd, mym + mova [rsp+0x10], m11 + mova [rsp+0x20], m12 + %if isput + mova [rsp+0x30], m13 + %endif + movzx t0d, t0b + sub srcq, 2 + movd m15, t0d +%else + %define m8 m0 + %xdefine m14 m4 + %define m15 m3 + movzx r4, byte [esp+0x1f0] + sub srcq, 2 + movd m15, r4 +%endif + pmaddwd m8, [base+rescale_mul] +%if ARCH_X86_64 + mova m9, [base+pd_0x4000] +%else + %define m9 [base+pd_0x4000] +%endif + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] + pand m0, m14, m10 + psrld m0, 6 + paddd m15, m0 + pshufd m7, m15, q1032 +%if ARCH_X86_64 + movd r4d, m15 + movd r11d, m7 + pshufd m15, m15, q0321 + pshufd m7, m7, q0321 + movd r6d, m15 + movd r13d, m7 + mova m10, [base+bdct_lb_q+ 0] + mova m11, [base+bdct_lb_q+16] + movd m13, [base+subpel_filters+ r4*8+2] + movd m2, [base+subpel_filters+ r6*8+2] + movd m15, [base+subpel_filters+r11*8+2] + movd m4, [base+subpel_filters+r13*8+2] +%else + movd r0, m15 + movd r4, m7 + pshufd m15, m15, q0321 + pshufd m7, m7, q0321 + movd rX, m15 + movd r5, m7 + mova m5, [base+bdct_lb_q+ 0] + mova m6, [base+bdct_lb_q+16] + movd m1, [base+subpel_filters+r0*8+2] + movd m2, [base+subpel_filters+rX*8+2] + movd m3, [base+subpel_filters+r4*8+2] + movd m7, [base+subpel_filters+r5*8+2] + movifprep r3, r3m + SWAP m4, m7 + %define m10 m5 + %define m11 m6 + %define m12 m1 + %define m13 m1 +%endif + psrld m14, 10 + paddd m14, m14 + punpckldq m13, m2 + punpckldq m15, m4 + punpcklqdq m13, m15 + pxor m2, m2 + pcmpeqd m0, m2 +%if ARCH_X86_64 + pand m9, m0 +%else + pand m2, m9, m0 + %define m9 m2 + SWAP m7, m4 +%endif + pandn m0, m13 +%if ARCH_X86_64 + SWAP m13, m0 +%else + %define m13 m0 +%endif + por m13, m9 + punpckhbw m15, m13, m13 + punpcklbw m13, m13 + psraw m15, 8 + psraw m13, 8 + pshufb m12, m14, m10 + pshufb m14, m11 + mova m10, [base+spel_s_shuf2] + movd r4d, m14 + shr r4d, 24 +%if ARCH_X86_32 + mova [stk+0x20], m13 + mova [stk+0x30], m15 + pxor m2, m2 +%endif + pshufb m7, m14, m2 + psubb m14, m7 + paddb m12, m10 + paddb m14, m10 +%if ARCH_X86_64 + lea r6, [r4+ssq*1] + lea r11, [r4+ssq*2] + lea r13, [r4+ss3q ] + movu m7, [srcq+ssq*0] + movu m9, [srcq+ssq*1] + movu m8, [srcq+ssq*2] + movu m10, [srcq+ss3q ] + movu m1, [srcq+r4 ] + movu m3, [srcq+r6 ] + movu m2, [srcq+r11 ] + movu m4, [srcq+r13 ] + lea srcq, [srcq+ssq*4] + REPX {pshufb x, m12}, m7, m9, m8, m10 + REPX {pmaddwd x, m13}, m7, m9, m8, m10 + REPX {pshufb x, m14}, m1, m2, m3, m4 + REPX {pmaddwd x, m15}, m1, m2, m3, 
m4 + mova m5, [rsp+0x10] + movd xm6, [rsp+0x20] + phaddd m7, m1 + phaddd m9, m3 + phaddd m8, m2 + phaddd m10, m4 + movu m1, [srcq+ssq*0] + movu m2, [srcq+ssq*1] + movu m3, [srcq+ssq*2] + movu m4, [srcq+ss3q ] + REPX {paddd x, m5}, m7, m9, m8, m10 + REPX {psrad x, xm6}, m7, m9, m8, m10 + packssdw m7, m9 ; 0 1 + packssdw m8, m10 ; 2 3 + movu m0, [srcq+r4 ] + movu m9, [srcq+r6 ] + movu m10, [srcq+r11 ] + movu m11, [srcq+r13 ] + lea srcq, [srcq+ssq*4] + REPX {pshufb x, m12}, m1, m2, m3, m4 + REPX {pmaddwd x, m13}, m1, m2, m3, m4 + REPX {pshufb x, m14}, m0, m9, m10, m11 + REPX {pmaddwd x, m15}, m0, m9, m10, m11 + phaddd m1, m0 + phaddd m2, m9 + phaddd m3, m10 + phaddd m4, m11 + REPX {paddd x, m5}, m1, m2, m3, m4 + REPX {psrad x, xm6}, m1, m2, m3, m4 + packssdw m1, m2 ; 4 5 + packssdw m3, m4 ; 6 7 + SWAP m9, m1 + shufps m4, m7, m8, q1032 ; 1 2 + shufps m5, m8, m9, q1032 ; 3 4 + shufps m6, m9, m3, q1032 ; 5 6 + pshufd m10, m3, q1032 ; 7 _ + punpcklwd m0, m7, m4 ; 01 + punpckhwd m7, m4 ; 12 + punpcklwd m1, m8, m5 ; 23 + punpckhwd m8, m5 ; 34 + punpcklwd m2, m9, m6 ; 45 + punpckhwd m9, m6 ; 56 + punpcklwd m3, m10 ; 67 + mova [rsp+0x40], m7 + mova [rsp+0x50], m8 + mova [rsp+0x60], m9 +%else + mova [stk+0x00], m12 + mova [stk+0x10], m14 + add r4, srcq + MC_4TAP_SCALED_H 0x40 ; 0 1 + MC_4TAP_SCALED_H 0x50 ; 2 3 + MC_4TAP_SCALED_H 0x60 ; 4 5 + MC_4TAP_SCALED_H 0x70 ; 6 7 + mova m4, [stk+0x40] + mova m5, [stk+0x50] + mova m6, [stk+0x60] + mova m7, [stk+0x70] + mov [stk+0xc0], r4 + shufps m1, m4, m5, q1032 ; 1 2 + shufps m2, m5, m6, q1032 ; 3 4 + shufps m3, m6, m7, q1032 ; 5 6 + pshufd m0, m7, q1032 ; 7 _ + mova [stk+0xb0], m0 + punpcklwd m0, m4, m1 ; 01 + punpckhwd m4, m1 ; 12 + punpcklwd m1, m5, m2 ; 23 + punpckhwd m5, m2 ; 34 + punpcklwd m2, m6, m3 ; 45 + punpckhwd m6, m3 ; 56 + punpcklwd m3, m7, [stk+0xb0] ; 67 + mov myd, mym + mov r0, r0m + mova [stk+0x40], m0 ; 01 + mova [stk+0x50], m1 ; 23 + mova [stk+0x60], m2 ; 45 + mova [stk+0x70], m3 ; 67 + mova [stk+0x80], m4 ; 12 + mova [stk+0x90], m5 ; 34 + mova [stk+0xa0], m6 ; 56 + %define m12 [stk+0x00] + %define m14 [stk+0x10] + %define m13 [stk+0x20] + %define m15 [stk+0x30] + %define hrnd_mem [esp+0x00] + %define hsh_mem [esp+0x10] + %if isput + %define vrnd_mem [esp+0x20] + %else + %define vrnd_mem [base+pd_m524256] + %endif +%endif +.w4_loop: + and myd, 0x3ff +%if ARCH_X86_64 + mov r11d, 64 << 24 + mov r13d, myd + shr r13d, 6 + lea r13d, [t1+r13] + cmovnz r11q, [base+subpel_filters+r13*8] + movq m9, r11q + punpcklbw m9, m9 + psraw m9, 8 + pshufd m7, m9, q0000 + pshufd m8, m9, q1111 + pmaddwd m4, m0, m7 + pmaddwd m5, m1, m8 + pshufd m7, m9, q2222 + pshufd m9, m9, q3333 + pmaddwd m6, m2, m7 + pmaddwd m8, m3, m9 + %if isput + movd m9, [rsp+0x28] + %define vrnd_mem [rsp+0x30] + %else + %define vrnd_mem [base+pd_m524256] + %endif + paddd m4, m5 + paddd m6, m8 + paddd m4, m6 + paddd m4, vrnd_mem +%else + mov mym, myd + mov r5, [esp+0x1f4] + xor r3, r3 + shr r4, 6 + lea r5, [r5+r4] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r5*8+0] + cmovnz r3, [base+subpel_filters+r5*8+4] + movd m7, r4 + movd m6, r3 + punpckldq m7, m6 + punpcklbw m7, m7 + psraw m7, 8 + pshufd m4, m7, q0000 + pshufd m5, m7, q1111 + pshufd m6, m7, q2222 + pshufd m7, m7, q3333 + pmaddwd m0, m4 + pmaddwd m1, m5 + pmaddwd m2, m6 + pmaddwd m3, m7 + %if isput + movd m4, [esp+0x18] + %endif + paddd m0, m1 + paddd m2, m3 + paddd m0, vrnd_mem + paddd m0, m2 + SWAP m4, m0 + %define m9 m0 +%endif +%if isput + pxor m5, m5 + psrad m4, m9 + packssdw m4, m4 + pmaxsw m4, m5 + pminsw m4, pxmaxm + 
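; store one 4-pixel row of output clamped to [0, bitdepth_max] +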
movq [dstq], m4 + add dstq, dsmp +%else + psrad m4, 6 + packssdw m4, m4 + movq [tmpq], m4 + add tmpq, 8 +%endif + dec hd + jz .ret +%if ARCH_X86_64 + add myd, dyd + test myd, ~0x3ff + jz .w4_loop + mova m8, [rsp+0x10] + movd m9, [rsp+0x20] + movu m4, [srcq] + movu m5, [srcq+r4] + test myd, 0x400 + jz .w4_skip_line + mova m0, [rsp+0x40] + mova [rsp+0x40], m1 + mova m1, [rsp+0x50] + mova [rsp+0x50], m2 + mova m2, [rsp+0x60] + mova [rsp+0x60], m3 + pshufb m4, m12 + pshufb m5, m14 + pmaddwd m4, m13 + pmaddwd m5, m15 + phaddd m4, m5 + paddd m4, m8 + psrad m4, m9 + packssdw m4, m4 + punpcklwd m3, m10, m4 + mova m10, m4 + add srcq, ssq + jmp .w4_loop +.w4_skip_line: + movu m6, [srcq+ssq*1] + movu m7, [srcq+r6] + mova m0, [rsp+0x50] + mova m11, [rsp+0x60] + pshufb m4, m12 + pshufb m6, m12 + pshufb m5, m14 + pshufb m7, m14 + pmaddwd m4, m13 + pmaddwd m6, m13 + pmaddwd m5, m15 + pmaddwd m7, m15 + mova [rsp+0x40], m0 + mova [rsp+0x50], m11 + phaddd m4, m5 + phaddd m6, m7 + paddd m4, m8 + paddd m6, m8 + psrad m4, m9 + psrad m6, m9 + packssdw m4, m6 + punpcklwd m9, m10, m4 + mova [rsp+0x60], m9 + pshufd m10, m4, q1032 + mova m0, m1 + mova m1, m2 + mova m2, m3 + punpcklwd m3, m4, m10 + lea srcq, [srcq+ssq*2] + jmp .w4_loop +%else + SWAP m0, m4 + mov myd, mym + mov r3, r3m + add myd, dym + test myd, ~0x3ff + jnz .w4_next_line + mova m0, [stk+0x40] + mova m1, [stk+0x50] + mova m2, [stk+0x60] + mova m3, [stk+0x70] + jmp .w4_loop +.w4_next_line: + mov r5, [stk+0xc0] + movu m4, [srcq] + movu m5, [r5] + test myd, 0x400 + jz .w4_skip_line + add [stk+0xc0], ssq + mova m0, [stk+0x80] + mova m3, [stk+0x50] + mova [stk+0x40], m0 + mova [stk+0x80], m3 + mova m1, [stk+0x90] + mova m6, [stk+0x60] + mova [stk+0x50], m1 + mova [stk+0x90], m6 + mova m2, [stk+0xa0] + mova m7, [stk+0x70] + mova [stk+0x60], m2 + mova [stk+0xa0], m7 + pshufb m4, m12 + pshufb m5, m14 + pmaddwd m4, m13 + pmaddwd m5, m15 + phaddd m4, m5 + paddd m4, hrnd_mem + psrad m4, hsh_mem + packssdw m4, m4 + punpcklwd m3, [stk+0xb0], m4 + mova [stk+0xb0], m4 + mova [stk+0x70], m3 + add srcq, ssq + jmp .w4_loop +.w4_skip_line: + movu m6, [srcq+ssq*1] + movu m7, [r5 +ssq*1] + lea r5, [r5 +ssq*2] + mov [stk+0xc0], r5 + mova m0, [stk+0x50] + mova m1, [stk+0x60] + mova m2, [stk+0x70] + mova m3, [stk+0x90] + pshufb m4, m12 + pshufb m6, m12 + pshufb m5, m14 + pshufb m7, m14 + pmaddwd m4, m13 + pmaddwd m6, m13 + pmaddwd m5, m15 + pmaddwd m7, m15 + mova [stk+0x40], m0 + mova [stk+0x50], m1 + mova [stk+0x60], m2 + mova [stk+0x80], m3 + phaddd m4, m5 + phaddd m6, m7 + mova m5, [stk+0xa0] + mova m7, [stk+0xb0] + paddd m4, hrnd_mem + paddd m6, hrnd_mem + psrad m4, hsh_mem + psrad m6, hsh_mem + packssdw m4, m6 + punpcklwd m7, m4 + pshufd m6, m4, q1032 + mova [stk+0x90], m5 + mova [stk+0xa0], m7 + mova [stk+0xb0], m6 + punpcklwd m3, m4, m6 + mova [stk+0x70], m3 + lea srcq, [srcq+ssq*2] + jmp .w4_loop +%endif +INIT_XMM ssse3 +%if ARCH_X86_64 + %define stk rsp+0x20 +%endif +.w8: + mov dword [stk+0xf0], 1 + movifprep tmp_stridem, 16 + jmp .w_start +.w16: + mov dword [stk+0xf0], 2 + movifprep tmp_stridem, 32 + jmp .w_start +.w32: + mov dword [stk+0xf0], 4 + movifprep tmp_stridem, 64 + jmp .w_start +.w64: + mov dword [stk+0xf0], 8 + movifprep tmp_stridem, 128 + jmp .w_start +.w128: + mov dword [stk+0xf0], 16 + movifprep tmp_stridem, 256 +.w_start: +%if ARCH_X86_64 + %ifidn %1, put + movifnidn dsm, dsq + %endif + mova [rsp+0x10], m11 + %define hround m11 + shr t0d, 16 + movd m15, t0d + %if isprep + mova m13, [base+pd_m524256] + %endif +%else + %define hround [esp+0x00] + 
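; x86_32: constants are kept in memory since only 8 xmm registers are available +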
%define m12 [esp+0x10] + %define m10 [base+pd_0x3ff] + %define m8 m0 + %xdefine m14 m4 + %define m15 m3 + %if isprep + %define ssq ssm + %endif + mov r4, [esp+0x1f0] + shr r4, 16 + movd m15, r4 + mov r0, r0m + mov myd, mym +%endif + sub srcq, 6 + pslld m7, m8, 2 ; dx*4 + pmaddwd m8, [base+rescale_mul] ; dx*[0-3] + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] + mova [stk+0x100], m7 + mova [stk+0x120], m15 + mov [stk+0x0f8], srcq + mov [stk+0x130], r0q ; dstq / tmpq +%if ARCH_X86_64 && UNIX64 + mov hm, hd +%elif ARCH_X86_32 + mov r5, hm + mov [stk+0x0f4], myd + mov [stk+0x134], r5 +%endif + jmp .hloop +.hloop_prep: + dec dword [stk+0x0f0] + jz .ret +%if ARCH_X86_64 + add qword [stk+0x130], 16 + mov hd, hm +%else + add dword [stk+0x130], 16 + mov myd, [stk+0x0f4] + mov r5, [stk+0x134] + mov r0, [stk+0x130] +%endif + mova m7, [stk+0x100] + mova m14, [stk+0x110] +%if ARCH_X86_64 + mova m10, [base+pd_0x3ff] + mova m11, [rsp+0x10] +%endif + mova m15, [stk+0x120] + mov srcq, [stk+0x0f8] +%if ARCH_X86_64 + mov r0q, [stk+0x130] ; dstq / tmpq +%else + mov mym, myd + mov hm, r5 + mov r0m, r0 + mov r3, r3m +%endif + paddd m14, m7 +.hloop: +%if ARCH_X86_64 + mova m9, [base+pq_0x40000000] +%else + %define m9 [base+pq_0x40000000] +%endif + pxor m1, m1 + psrld m2, m14, 10 + mova [stk], m2 + pand m6, m14, m10 + psrld m6, 6 + paddd m5, m15, m6 + pcmpeqd m6, m1 + pshufd m2, m5, q1032 +%if ARCH_X86_64 + movd r4d, m5 + movd r6d, m2 + pshufd m5, m5, q0321 + pshufd m2, m2, q0321 + movd r7d, m5 + movd r9d, m2 + movq m0, [base+subpel_filters+r4*8] + movq m1, [base+subpel_filters+r6*8] + movhps m0, [base+subpel_filters+r7*8] + movhps m1, [base+subpel_filters+r9*8] +%else + movd r0, m5 + movd rX, m2 + pshufd m5, m5, q0321 + pshufd m2, m2, q0321 + movd r4, m5 + movd r5, m2 + movq m0, [base+subpel_filters+r0*8] + movq m1, [base+subpel_filters+rX*8] + movhps m0, [base+subpel_filters+r4*8] + movhps m1, [base+subpel_filters+r5*8] +%endif + paddd m14, m7 ; mx+dx*[4-7] + pand m5, m14, m10 + psrld m5, 6 + paddd m15, m5 + pxor m2, m2 + pcmpeqd m5, m2 + mova [stk+0x110], m14 + pshufd m4, m15, q1032 +%if ARCH_X86_64 + movd r10d, m15 + movd r11d, m4 + pshufd m15, m15, q0321 + pshufd m4, m4, q0321 + movd r13d, m15 + movd rXd, m4 + movq m2, [base+subpel_filters+r10*8] + movq m3, [base+subpel_filters+r11*8] + movhps m2, [base+subpel_filters+r13*8] + movhps m3, [base+subpel_filters+ rX*8] + psrld m14, 10 + movq r11, m14 + punpckhqdq m14, m14 + movq rX, m14 + mov r10d, r11d + shr r11, 32 + mov r13d, rXd + shr rX, 32 + mov r4d, [stk+ 0] + mov r6d, [stk+ 4] + mov r7d, [stk+ 8] + mov r9d, [stk+12] + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m14, m5, q1100 + pshufd m5, m5, q3322 + pand m7, m9, m4 + pand m8, m9, m6 + pand m15, m9, m14 + pand m9, m9, m5 + pandn m4, m0 + pandn m6, m1 + pandn m14, m2 + pandn m5, m3 + por m7, m4 + por m8, m6 + por m15, m14 + por m9, m5 + punpcklbw m0, m7, m7 + punpckhbw m7, m7 + punpcklbw m1, m8, m8 + punpckhbw m8, m8 + psraw m0, 8 + psraw m7, 8 + psraw m1, 8 + psraw m8, 8 + punpcklbw m2, m15, m15 + punpckhbw m15, m15 + punpcklbw m3, m9, m9 + punpckhbw m9, m9 + psraw m2, 8 + psraw m15, 8 + psraw m3, 8 + psraw m9, 8 + mova [stk+0x10], m0 + mova [stk+0x20], m7 + mova [stk+0x30], m1 + mova [stk+0x40], m8 + mova [stk+0x50], m2 + mova [stk+0x60], m15 + mova [stk+0x70], m3 + mova [stk+0x80], m9 + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0 + mova [stk+0x90], m1 + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1 + mova [stk+0xa0], m2 + MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2 + mova [stk+0xb0], 
m3 + MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3 + mova [stk+0xc0], m4 + MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4 + mova [stk+0xd0], m5 + MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5 + MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6 + MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7 + mova m5, [stk+0xd0] + mova m1, [stk+0x90] + mova m2, [stk+0xa0] + mova m3, [stk+0xb0] + mova m9, [stk+0xc0] + mov myd, mym + mov dyd, dym + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m8 ; 67a + punpckhwd m7, m8 ; 67b + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m9 ; 23a + punpckhwd m3, m9 ; 23b + mova [stk+0x90], m4 + mova [stk+0xa0], m5 + mova [stk+0xb0], m6 + mova [stk+0xc0], m7 + %define hround [rsp+0x10] +.vloop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq m11, r6q + punpcklbw m11, m11 + psraw m11, 8 + pshufd m5, m11, q0000 + pshufd m7, m11, q1111 + pshufd m10, m11, q2222 + pshufd m11, m11, q3333 + pmaddwd m4, m5, m0 + pmaddwd m5, m5, m1 + pmaddwd m6, m7, m2 + pmaddwd m7, m7, m3 + paddd m4, m13 + paddd m5, m13 + paddd m4, m6 + paddd m5, m7 + pmaddwd m6, [stk+0x90], m10 + pmaddwd m7, [stk+0xa0], m10 + pmaddwd m8, [stk+0xb0], m11 + pmaddwd m9, [stk+0xc0], m11 + paddd m4, m6 + paddd m5, m7 + %if isput + pshufd m6, m12, q1032 + %endif + paddd m4, m8 + paddd m5, m9 +%else + movd r0, m15 + movd rX, m4 + pshufd m15, m15, q0321 + pshufd m4, m4, q0321 + movd r4, m15 + movd r5, m4 + mova m14, [stk+0x110] + movq m2, [base+subpel_filters+r0*8] + movq m3, [base+subpel_filters+rX*8] + movhps m2, [base+subpel_filters+r4*8] + movhps m3, [base+subpel_filters+r5*8] + psrld m14, 10 + mova [stk+16], m14 + mov r0, [stk+ 0] + mov rX, [stk+ 4] + mov r4, [stk+ 8] + mov r5, [stk+12] + mova [stk+0x20], m0 + mova [stk+0x30], m1 + mova [stk+0x40], m2 + mova [stk+0x50], m3 + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m7, m5, q1100 + pshufd m5, m5, q3322 + pand m0, m9, m4 + pand m1, m9, m6 + pand m2, m9, m7 + pand m3, m9, m5 + pandn m4, [stk+0x20] + pandn m6, [stk+0x30] + pandn m7, [stk+0x40] + pandn m5, [stk+0x50] + por m0, m4 + por m1, m6 + por m2, m7 + por m3, m5 + punpcklbw m4, m0, m0 + punpckhbw m0, m0 + punpcklbw m5, m1, m1 + punpckhbw m1, m1 + psraw m4, 8 + psraw m0, 8 + psraw m5, 8 + psraw m1, 8 + punpcklbw m6, m2, m2 + punpckhbw m2, m2 + punpcklbw m7, m3, m3 + punpckhbw m3, m3 + psraw m6, 8 + psraw m2, 8 + psraw m7, 8 + psraw m3, 8 + mova [stk+0x0a0], m4 + mova [stk+0x0b0], m0 + mova [stk+0x0c0], m5 + mova [stk+0x0d0], m1 + mova [stk+0x140], m6 + mova [stk+0x150], m2 + mova [stk+0x160], m7 + mova [stk+0x170], m3 + MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0 + MC_8TAP_SCALED_H 0xa0, 0x30 ; 1 + MC_8TAP_SCALED_H 0xa0, 0x40 ; 2 + MC_8TAP_SCALED_H 0xa0, 0x50 ; 3 + MC_8TAP_SCALED_H 0xa0, 0x60 ; 4 + MC_8TAP_SCALED_H 0xa0, 0x70 ; 5 + MC_8TAP_SCALED_H 0xa0, 0x80 ; 6 + MC_8TAP_SCALED_H 0xa0, 0x90 ; 7 + mova m5, [stk+0x60] + mova m6, [stk+0x70] + mova m7, [stk+0x80] + mova m0, [stk+0x90] + mov myd, mym + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m0 ; 67a + punpckhwd m7, m0 ; 67b + mova [stk+0x60], m4 + mova [stk+0x70], m5 + mova [stk+0x80], m6 + mova [stk+0x90], m7 + mova m1, [stk+0x20] + mova m2, [stk+0x30] + mova m3, [stk+0x40] + mova m4, [stk+0x50] + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m4 ; 23a + punpckhwd m3, m4 ; 23b + mova [stk+0x20], m0 + mova [stk+0x30], m1 + mova [stk+0x40], m2 + mova [stk+0x50], m3 +.vloop: + mov r0, 
r0m + mov r5, [esp+0x1f4] + and myd, 0x3ff + mov mym, myd + xor r3, r3 + shr r4, 6 + lea r5, [r5+r4] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r5*8+0] + cmovnz r3, [base+subpel_filters+r5*8+4] + movd m7, r4 + movd m6, r3 + punpckldq m7, m6 + punpcklbw m7, m7 + psraw m7, 8 + pshufd m4, m7, q0000 + pshufd m5, m7, q1111 + pmaddwd m0, m4 + pmaddwd m1, m4 + pmaddwd m2, m5 + pmaddwd m3, m5 + pshufd m6, m7, q2222 + pshufd m7, m7, q3333 + paddd m0, m2 + paddd m1, m3 + pmaddwd m2, [stk+0x60], m6 + pmaddwd m3, [stk+0x70], m6 + pmaddwd m4, [stk+0x80], m7 + pmaddwd m5, [stk+0x90], m7 + %if isput + movd m6, [esp+0x18] + %endif + paddd m0, m2 + paddd m1, m3 + paddd m0, vrnd_mem + paddd m1, vrnd_mem + paddd m4, m0 + paddd m5, m1 +%endif +%ifidn %1, put + psrad m4, m6 + psrad m5, m6 + packssdw m4, m5 + pxor m7, m7 + pmaxsw m4, m7 + pminsw m4, pxmaxm + mova [dstq], m4 + add dstq, dsm +%else + psrad m4, 6 + psrad m5, 6 + packssdw m4, m5 + mova [tmpq], m4 + add tmpq, tmp_stridem +%endif + dec hd + jz .hloop_prep +%if ARCH_X86_64 + add myd, dyd + test myd, ~0x3ff + jz .vloop + test myd, 0x400 + mov [stk+0x140], myd + mov r4d, [stk+ 0] + mov r6d, [stk+ 4] + mov r7d, [stk+ 8] + mov r9d, [stk+12] + jz .skip_line + mova m14, [base+unpckw] + movu m8, [srcq+r10*2] + movu m9, [srcq+r11*2] + movu m10, [srcq+r13*2] + movu m11, [srcq+ rX*2] + movu m4, [srcq+ r4*2] + movu m5, [srcq+ r6*2] + movu m6, [srcq+ r7*2] + movu m7, [srcq+ r9*2] + add srcq, ssq + mov myd, [stk+0x140] + mov dyd, dym + pshufd m15, m14, q1032 + pshufb m0, m14 ; 0a 1a + pshufb m1, m14 ; 0b 1b + pshufb m2, m15 ; 3a 2a + pshufb m3, m15 ; 3b 2b + pmaddwd m8, [stk+0x50] + pmaddwd m9, [stk+0x60] + pmaddwd m10, [stk+0x70] + pmaddwd m11, [stk+0x80] + pmaddwd m4, [stk+0x10] + pmaddwd m5, [stk+0x20] + pmaddwd m6, [stk+0x30] + pmaddwd m7, [stk+0x40] + phaddd m8, m9 + phaddd m10, m11 + mova m11, hround + phaddd m4, m5 + phaddd m6, m7 + phaddd m8, m10 + phaddd m4, m6 + paddd m4, m11 + paddd m8, m11 + psrad m4, m12 + psrad m8, m12 + packssdw m4, m8 + pshufb m5, [stk+0x90], m14 ; 4a 5a + pshufb m6, [stk+0xa0], m14 ; 4b 5b + pshufb m7, [stk+0xb0], m15 ; 7a 6a + pshufb m8, [stk+0xc0], m15 ; 7b 6b + punpckhwd m0, m2 ; 12a + punpckhwd m1, m3 ; 12b + punpcklwd m2, m5 ; 34a + punpcklwd m3, m6 ; 34b + punpckhwd m5, m7 ; 56a + punpckhwd m6, m8 ; 56b + punpcklwd m7, m4 ; 78a + punpckhqdq m4, m4 + punpcklwd m8, m4 ; 78b + mova [stk+0x90], m5 + mova [stk+0xa0], m6 + mova [stk+0xb0], m7 + mova [stk+0xc0], m8 + jmp .vloop +.skip_line: + MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11 + MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 0, 10, 11 + mov myd, [stk+0x140] + mov dyd, dym + mova m0, m2 ; 01a + mova m1, m3 ; 01b + mova m2, [stk+0x90] ; 23a + mova m3, [stk+0xa0] ; 23b + mova m5, [stk+0xb0] ; 45a + mova m6, [stk+0xc0] ; 45b + punpcklwd m7, m4, m8 ; 67a + punpckhwd m4, m8 ; 67b + mova [stk+0x90], m5 + mova [stk+0xa0], m6 + mova [stk+0xb0], m7 + mova [stk+0xc0], m4 +%else + mov r0m, r0 + mov myd, mym + mov r3, r3m + add myd, dym + test myd, ~0x3ff + mov mym, myd + jnz .next_line + mova m0, [stk+0x20] + mova m1, [stk+0x30] + mova m2, [stk+0x40] + mova m3, [stk+0x50] + jmp .vloop +.next_line: + test myd, 0x400 + mov r0, [stk+ 0] + mov rX, [stk+ 4] + mov r4, [stk+ 8] + mov r5, [stk+12] + jz .skip_line + MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8 + mova m7, [base+unpckw] + pshufd m4, m7, q1032 + pshufb m0, [stk+0x20], m7 ; 0a 1a + pshufb m1, [stk+0x30], m7 ; 0b 1b + pshufb m2, [stk+0x40], m4 ; 3a 2a + pshufb m3, [stk+0x50], m4 ; 3b 2b + pshufb m5, [stk+0x60], m7 ; 4a 5a + pshufb m6, [stk+0x70], 
m7 ; 4b 5b + pshufb m7, [stk+0x80], m4 ; 7a 6a + punpckhwd m0, m2 ; 12a + punpckhwd m1, m3 ; 12b + punpcklwd m2, m5 ; 34a + punpcklwd m3, m6 ; 34b + mova [stk+0x20], m0 + mova [stk+0x30], m1 + mova [stk+0x40], m2 + mova [stk+0x50], m3 + punpckhwd m5, m7 ; 56a + mova [stk+0x60], m5 + pshufb m5, [stk+0x90], m4 ; 7b 6b + punpcklwd m7, [stk+0xe0] ; 78a + punpckhwd m6, m5 ; 56b + mova [stk+0x70], m6 + movq m6, [stk+0xe8] + mova [stk+0x80], m7 + punpcklwd m5, m6 + mov myd, mym + mova [stk+0x90], m5 + jmp .vloop +.skip_line: + MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8 + MC_8TAP_SCALED_H 0xa0, 0 ; 9 + mova m7, [stk+0xe0] + mova m2, [stk+0x60] ; 23a + mova m3, [stk+0x70] ; 23b + mova m4, [stk+0x80] ; 45a + mova m5, [stk+0x90] ; 45b + punpcklwd m6, m7, m0 ; 67a + punpckhwd m7, m0 ; 67b + mova m0, [stk+0x40] ; 01a + mova m1, [stk+0x50] ; 01b + mov myd, mym + mova [stk+0x40], m2 + mova [stk+0x50], m3 + mova [stk+0x60], m4 + mova [stk+0x70], m5 + mova [stk+0x80], m6 + mova [stk+0x90], m7 + mova [stk+0x20], m0 + mova [stk+0x30], m1 +%endif + jmp .vloop +INIT_XMM ssse3 +.dy1: + movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2] + add wq, base_reg + jmp wq +%if isput +.dy1_w2: + %if ARCH_X86_64 + mov myd, mym + movzx t0d, t0b + sub srcq, 2 + movd m15, t0d + %else + %define m8 m0 + %define m9 m1 + %define m14 m4 + %define m15 m3 + %define m11 [esp+0x00] + %define m12 [esp+0x10] + %define m13 [esp+0x20] + movzx r5, byte [esp+0x1f0] + sub srcq, 2 + movd m15, r5 + mov r1, r1m + %endif + pxor m9, m9 + punpckldq m9, m8 + paddd m14, m9 ; mx+dx*[0-1] + %if ARCH_X86_64 + mova m9, [base+pd_0x4000] + %endif + pshufd m15, m15, q0000 + pand m8, m14, m10 + psrld m8, 6 + paddd m15, m8 + movd r4d, m15 + pshufd m15, m15, q0321 + %if ARCH_X86_64 + movd r6d, m15 + %else + movd r3d, m15 + %endif + mova m5, [base+bdct_lb_q] + mova m6, [base+spel_s_shuf2] + movd m15, [base+subpel_filters+r4*8+2] + %if ARCH_X86_64 + movd m7, [base+subpel_filters+r6*8+2] + %else + movd m7, [base+subpel_filters+r3*8+2] + %endif + pxor m2, m2 + pcmpeqd m8, m2 + psrld m14, 10 + paddd m14, m14 + %if ARCH_X86_32 + mov r3, r3m + pshufb m14, m5 + paddb m14, m6 + mova [stk], m14 + SWAP m5, m0 + SWAP m6, m3 + %define m15 m6 + %endif + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + movu m2, [srcq+ssq*2] + movu m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + punpckldq m15, m7 + %if ARCH_X86_64 + pshufb m14, m5 + paddb m14, m6 + pand m9, m8 + pandn m8, m15 + SWAP m15, m8 + por m15, m9 + movu m4, [srcq+ssq*0] + movu m5, [srcq+ssq*1] + movu m6, [srcq+ssq*2] + add srcq, ss3q + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + %else + pand m7, m5, [base+pd_0x4000] + pandn m5, m15 + por m5, m7 + %define m15 m5 + mov myd, mym + mov r5, [esp+0x1f4] + xor r3, r3 + shr myd, 6 + lea r5, [r5+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r5*8+0] + cmovnz r3, [base+subpel_filters+r5*8+4] + mov [stk+0x20], r3 + mov r3, r3m + %endif + punpcklbw m15, m15 + psraw m15, 8 + REPX {pshufb x, m14}, m0, m1, m2, m3 + REPX {pmaddwd x, m15}, m0, m1, m2, m3 + %if ARCH_X86_64 + REPX {pshufb x, m14}, m4, m5, m6 + REPX {pmaddwd x, m15}, m4, m5, m6 + phaddd m0, m1 + phaddd m2, m3 + phaddd m4, m5 + phaddd m6, m6 + REPX {paddd x, m11}, m0, m2, m4, m6 + REPX {psrad x, m12}, m0, m2, m4, m6 + packssdw m0, m2 ; 0 1 2 3 + packssdw m4, m6 ; 4 5 6 + SWAP m1, m4 + movq m10, r4 + %else + mova [stk+0x10], m15 + phaddd m0, m1 + phaddd m2, m3 + movu m1, [srcq+ssq*0] + movu m7, [srcq+ssq*1] + movu m6, [srcq+ssq*2] + add srcq, ss3q + REPX {pshufb x, 
m14}, m1, m7, m6 + REPX {pmaddwd x, m15}, m1, m7, m6 + %define m14 [stk+0x00] + %define m15 [stk+0x10] + phaddd m1, m7 + phaddd m6, m6 + REPX {paddd x, m11}, m0, m2, m1, m6 + REPX {psrad x, m12}, m0, m2, m1, m6 + packssdw m0, m2 + packssdw m1, m6 + %define m8 m6 + %define m9 m4 + %define m10 m5 + movd m10, r4 + movd m9, [stk+0x20] + punpckldq m10, m9 + %endif + punpcklbw m10, m10 + psraw m10, 8 + pshufd m7, m10, q0000 + pshufd m8, m10, q1111 + pshufd m9, m10, q2222 + pshufd m10, m10, q3333 + %if ARCH_X86_32 + mova [stk+0x50], m7 + mova [stk+0x60], m8 + mova [stk+0x70], m9 + mova [stk+0x80], m10 + %define m7 [stk+0x50] + %define m8 [stk+0x60] + %define m9 [stk+0x70] + %define m10 [stk+0x80] + %endif + palignr m2, m1, m0, 4 ; 1 2 3 4 + punpcklwd m3, m0, m2 ; 01 12 + punpckhwd m0, m2 ; 23 34 + pshufd m4, m1, q2121 ; 5 6 5 6 + punpcklwd m2, m1, m4 ; 45 56 + %if ARCH_X86_32 + mov r0, r0m + %endif +.dy1_w2_loop: + movu m1, [srcq+ssq*0] + movu m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddwd m5, m3, m7 + mova m3, m0 + pmaddwd m0, m8 + pshufb m1, m14 + pshufb m6, m14 + pmaddwd m1, m15 + pmaddwd m6, m15 + phaddd m1, m6 + paddd m1, m11 + psrad m1, m12 + packssdw m1, m1 + paddd m5, m0 + mova m0, m2 + pmaddwd m2, m9 + paddd m5, m2 + palignr m2, m1, m4, 12 + punpcklwd m2, m1 ; 67 78 + pmaddwd m4, m2, m10 + paddd m5, m13 + paddd m5, m4 + pxor m6, m6 + mova m4, m1 + pshufd m1, m12, q1032 + psrad m5, m1 + packssdw m5, m5 + pmaxsw m5, m6 + pminsw m5, pxmaxm + movd [dstq+dsq*0], m5 + pshuflw m5, m5, q1032 + movd [dstq+dsq*1], m5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy1_w2_loop + RET +%endif +INIT_XMM ssse3 +.dy1_w4: +%if ARCH_X86_64 + mov myd, mym + mova [rsp+0x10], m11 + mova [rsp+0x20], m12 + %if isput + mova [rsp+0x30], m13 + %define vrnd_mem [rsp+0x30] + %define stk rsp+0x40 + %else + %define vrnd_mem [base+pd_m524256] + %define stk rsp+0x30 + %endif + movzx t0d, t0b + sub srcq, 2 + movd m15, t0d +%else + %define m10 [base+pd_0x3ff] + %define m9 [base+pd_0x4000] + %define m8 m0 + %xdefine m14 m4 + %define m15 m3 + %if isprep + %define ssq r3 + %endif + movzx r5, byte [esp+0x1f0] + sub srcq, 2 + movd m15, r5 +%endif + pmaddwd m8, [base+rescale_mul] +%if ARCH_X86_64 + mova m9, [base+pd_0x4000] +%endif + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] + pand m0, m14, m10 + psrld m0, 6 + paddd m15, m0 + pshufd m7, m15, q1032 +%if ARCH_X86_64 + movd r4d, m15 + movd r11d, m7 + pshufd m15, m15, q0321 + pshufd m7, m7, q0321 + movd r6d, m15 + movd r13d, m7 + mova m10, [base+bdct_lb_q+ 0] + mova m11, [base+bdct_lb_q+16] + movd m13, [base+subpel_filters+ r4*8+2] + movd m2, [base+subpel_filters+ r6*8+2] + movd m15, [base+subpel_filters+r11*8+2] + movd m4, [base+subpel_filters+r13*8+2] +%else + movd r0, m15 + movd r4, m7 + pshufd m15, m15, q0321 + pshufd m7, m7, q0321 + movd rX, m15 + movd r5, m7 + mova m5, [base+bdct_lb_q+ 0] + mova m6, [base+bdct_lb_q+16] + movd m1, [base+subpel_filters+r0*8+2] + movd m2, [base+subpel_filters+rX*8+2] + movd m3, [base+subpel_filters+r4*8+2] + movd m7, [base+subpel_filters+r5*8+2] + SWAP m4, m7 + %if isprep + mov r3, r3m + %endif + %define m10 m5 + %define m11 m6 + %define m12 m1 + %define m13 m1 +%endif + psrld m14, 10 + paddd m14, m14 + punpckldq m13, m2 + punpckldq m15, m4 + punpcklqdq m13, m15 + pxor m2, m2 + pcmpeqd m0, m2 +%if ARCH_X86_64 + pand m9, m0 +%else + pand m2, m9, m0 + %define m9 m2 + SWAP m7, m4 +%endif + pandn m0, m13 +%if ARCH_X86_64 + SWAP m13, m0 +%else + %define m13 m0 +%endif + por m13, m9 + punpckhbw m15, m13, m13 + punpcklbw m13, m13 + psraw 
m15, 8 + psraw m13, 8 + pshufb m12, m14, m10 + pshufb m14, m11 + mova m10, [base+spel_s_shuf2] + movd r4d, m14 + shr r4d, 24 +%if ARCH_X86_32 + mova [stk+0x40], m13 + mova [stk+0x50], m15 + pxor m2, m2 +%endif + pshufb m7, m14, m2 + psubb m14, m7 + paddb m12, m10 + paddb m14, m10 +%if ARCH_X86_64 + lea r6, [r4+ssq*1] + lea r11, [r4+ssq*2] + lea r13, [r4+ss3q ] + movu m7, [srcq+ssq*0] + movu m9, [srcq+ssq*1] + movu m8, [srcq+ssq*2] + movu m10, [srcq+ss3q ] + movu m1, [srcq+r4 ] + movu m3, [srcq+r6 ] + movu m2, [srcq+r11 ] + movu m4, [srcq+r13 ] + lea srcq, [srcq+ssq*4] + REPX {pshufb x, m12}, m7, m9, m8, m10 + REPX {pmaddwd x, m13}, m7, m9, m8, m10 + REPX {pshufb x, m14}, m1, m3, m2, m4 + REPX {pmaddwd x, m15}, m1, m3, m2, m4 + mova m5, [rsp+0x10] + movd xm6, [rsp+0x20] + phaddd m7, m1 + phaddd m9, m3 + phaddd m8, m2 + phaddd m10, m4 + movu m1, [srcq+ssq*0] + movu m2, [srcq+ssq*1] + movu m3, [srcq+ssq*2] + REPX {paddd x, m5}, m7, m9, m8, m10 + REPX {psrad x, xm6}, m7, m9, m8, m10 + packssdw m7, m9 ; 0 1 + packssdw m8, m10 ; 2 3 + movu m0, [srcq+r4 ] + movu m9, [srcq+r6 ] + movu m10, [srcq+r11 ] + add srcq, ss3q + REPX {pshufb x, m12}, m1, m2, m3 + REPX {pmaddwd x, m13}, m1, m2, m3 + REPX {pshufb x, m14}, m0, m9, m10 + REPX {pmaddwd x, m15}, m0, m9, m10 + phaddd m1, m0 + phaddd m2, m9 + phaddd m3, m10 + shr myd, 6 + mov r13d, 64 << 24 + lea myd, [t1+myq] + cmovnz r13q, [base+subpel_filters+myq*8] + REPX {paddd x, m5}, m1, m2, m3 + REPX {psrad x, xm6}, m1, m2, m3 + packssdw m1, m2 ; 4 5 + packssdw m3, m3 ; 6 6 + SWAP m9, m1 + shufps m4, m7, m8, q1032 ; 1 2 + shufps m5, m8, m9, q1032 ; 3 4 + shufps m6, m9, m3, q1032 ; 5 6 + punpcklwd m0, m7, m4 ; 01 + punpckhwd m7, m4 ; 12 + punpcklwd m1, m8, m5 ; 23 + punpckhwd m8, m5 ; 34 + punpcklwd m2, m9, m6 ; 45 + punpckhwd m9, m6 ; 56 + movq m10, r13 + mova [stk+0x00], m1 + mova [stk+0x10], m8 + mova [stk+0x20], m2 + mova [stk+0x30], m9 + mova [stk+0x40], m3 + %define hrnd_mem [rsp+0x10] + %define hsh_mem [rsp+0x20] + %define vsh_mem [rsp+0x28] + %if isput + %define vrnd_mem [rsp+0x30] + %else + %define vrnd_mem [base+pd_m524256] + %endif +%else + mova [stk+0x20], m12 + mova [stk+0x30], m14 + add r4, srcq + MC_4TAP_SCALED_H 0x60 ; 0 1 + MC_4TAP_SCALED_H 0x70 ; 2 3 + MC_4TAP_SCALED_H 0x80 ; 4 5 + movu m7, [srcq] + movu m2, [r4] + add srcq, ssq + add r4, ssq + mov [stk+0xb0], r4 + pshufb m7, m12 + pshufb m2, m14 + pmaddwd m7, m13 + pmaddwd m2, m15 + phaddd m7, m2 + paddd m7, [esp+0x00] + psrad m7, [esp+0x10] + packssdw m7, m7 ; 6 6 + mova m4, [stk+0x60] + mova m5, [stk+0x70] + mova m6, [stk+0x80] + mov myd, mym + mov rX, [esp+0x1f4] + xor r5, r5 + shr myd, 6 + lea rX, [rX+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+rX*8+0] + cmovnz r5, [base+subpel_filters+rX*8+4] + mov r3, r3m + shufps m1, m4, m5, q1032 ; 1 2 + shufps m2, m5, m6, q1032 ; 3 4 + shufps m3, m6, m7, q1032 ; 5 6 + mova [stk+0xa0], m7 + punpcklwd m0, m4, m1 ; 01 + punpckhwd m4, m1 ; 12 + punpcklwd m1, m5, m2 ; 23 + punpckhwd m5, m2 ; 34 + punpcklwd m2, m6, m3 ; 45 + punpckhwd m6, m3 ; 56 + movd m7, r4 + movd m3, r5 + mov r0, r0m + %if isput + mov r1, r1m + %endif + mov r4, [stk+0xb0] + mova [stk+0xc0], m4 ; 12 + mova [stk+0x60], m1 ; 23 + mova [stk+0x70], m2 ; 45 + mova [stk+0x80], m5 ; 34 + mova [stk+0x90], m6 ; 56 + %define m12 [stk+0x20] + %define m14 [stk+0x30] + %define m13 [stk+0x40] + %define m15 [stk+0x50] + %define hrnd_mem [esp+0x00] + %define hsh_mem [esp+0x10] + %define vsh_mem [esp+0x18] + %if isput + %define vrnd_mem [esp+0x20] + %else + %define vrnd_mem 
[base+pd_m524256] + %endif + %define m10 m7 + punpckldq m10, m3 +%endif + punpcklbw m10, m10 + psraw m10, 8 + pshufd m3, m10, q0000 + pshufd m4, m10, q1111 + pshufd m5, m10, q2222 + pshufd m10, m10, q3333 +%if ARCH_X86_32 + %xdefine m8 m3 + %xdefine m9 m6 + %xdefine m11 m5 + %xdefine m6 m4 + mova [stk+0x100], m3 + mova [stk+0x110], m4 + mova [stk+0x120], m5 + mova [stk+0x130], m10 + %define m3 [stk+0x100] + %define m4 [stk+0x110] + %define m5 [stk+0x120] + %define m10 [stk+0x130] + mova m7, [stk+0xc0] + mova m8, [stk+0x80] +%endif +.dy1_w4_loop: + movu m11, [srcq+ssq*0] + movu m6, [srcq+ssq*1] + pmaddwd m0, m3 + pmaddwd m7, m3 + pmaddwd m1, m4 + pmaddwd m8, m4 + pmaddwd m2, m5 + pmaddwd m9, m5 + paddd m1, m0 + paddd m8, m7 +%if ARCH_X86_64 + movu m0, [srcq+r4] + movu m7, [srcq+r6] +%else + movu m0, [r4+ssq*0] + movu m7, [r4+ssq*1] + lea r4, [r4+ssq*2] +%endif + lea srcq, [srcq+ssq*2] + paddd m1, m2 + paddd m8, m9 + pshufb m11, m12 + pshufb m6, m12 + pmaddwd m11, m13 + pmaddwd m6, m13 + pshufb m0, m14 + pshufb m7, m14 + pmaddwd m0, m15 + pmaddwd m7, m15 + phaddd m11, m0 + phaddd m6, m7 + paddd m11, hrnd_mem + paddd m6, hrnd_mem + psrad m11, hsh_mem + psrad m6, hsh_mem + packssdw m11, m6 ; 7 8 +%if ARCH_X86_64 + shufps m9, [stk+0x40], m11, q1032 ; 6 7 + mova m0, [stk+0x00] + mova [stk+0x40], m11 +%else + shufps m9, [stk+0xa0], m11, q1032 ; 6 7 + mova m0, [stk+0x60] + mova [stk+0xa0], m11 +%endif + punpcklwd m2, m9, m11 ; 67 + punpckhwd m9, m11 ; 78 + pmaddwd m6, m2, m10 + pmaddwd m7, m9, m10 +%if isput + movd m11, vsh_mem +%endif + paddd m1, vrnd_mem + paddd m8, vrnd_mem + paddd m1, m6 + paddd m8, m7 +%if ARCH_X86_64 + mova m7, [stk+0x10] +%else + mova m7, [stk+0x80] +%endif +%if isput + psrad m1, m11 + psrad m8, m11 +%else + psrad m1, 6 + psrad m8, 6 +%endif + packssdw m1, m8 +%if ARCH_X86_64 + mova m8, [stk+0x30] +%else + mova m8, [stk+0x90] +%endif +%if isput + pxor m6, m6 + pmaxsw m1, m6 + pminsw m1, pxmaxm + movq [dstq+dsq*0], m1 + movhps [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] +%else + mova [tmpq], m1 + add tmpq, 16 +%endif +%if ARCH_X86_64 + mova m1, [stk+0x20] + mova [stk+0x10], m8 + mova [stk+0x00], m1 + mova [stk+0x20], m2 + mova [stk+0x30], m9 +%else + mova m1, [stk+0x70] + mova [stk+0x80], m8 + mova [stk+0x60], m1 + mova [stk+0x70], m2 + mova [stk+0x90], m9 +%endif + sub hd, 2 + jg .dy1_w4_loop + MC_8TAP_SCALED_RET ; why not jz .ret? 
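+; Widths >= 8 are handled as vertical stripes of 8 pixels. The labels below
+; differ only in their setup: [stk+0xf0] receives the stripe count (w/8),
+; which .dy1_hloop_prep decrements once per finished stripe, and for prep
+; calls movifprep loads tmp_stridem with the row stride in bytes (w*2 at
+; 16 bpc) before falling through to the shared .dy1_w_start code.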
+INIT_XMM ssse3 +.dy1_w8: + mov dword [stk+0xf0], 1 + movifprep tmp_stridem, 16 + jmp .dy1_w_start +.dy1_w16: + mov dword [stk+0xf0], 2 + movifprep tmp_stridem, 32 + jmp .dy1_w_start +.dy1_w32: + mov dword [stk+0xf0], 4 + movifprep tmp_stridem, 64 + jmp .dy1_w_start +.dy1_w64: + mov dword [stk+0xf0], 8 + movifprep tmp_stridem, 128 + jmp .dy1_w_start +.dy1_w128: + mov dword [stk+0xf0], 16 + movifprep tmp_stridem, 256 +.dy1_w_start: + mov myd, mym +%if ARCH_X86_64 + %ifidn %1, put + movifnidn dsm, dsq + %endif + mova [rsp+0x10], m11 + mova [rsp+0x20], m12 + %define hround m11 + %if isput + mova [rsp+0x30], m13 + %else + mova m13, [base+pd_m524256] + %endif + shr t0d, 16 + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + movd m15, t0d +%else + %define hround [esp+0x00] + %define m12 [esp+0x10] + %define m10 [base+pd_0x3ff] + %define m8 m0 + %xdefine m14 m4 + %xdefine m15 m3 + %if isprep + %define ssq ssm + %endif + mov r5, [esp+0x1f0] + mov r3, [esp+0x1f4] + shr r5, 16 + movd m15, r5 + xor r5, r5 + shr myd, 6 + lea r3, [r3+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r3*8+0] + cmovnz r5, [base+subpel_filters+r3*8+4] + mov r0, r0m + mov r3, r3m +%endif + sub srcq, 6 + pslld m7, m8, 2 ; dx*4 + pmaddwd m8, [base+rescale_mul] ; dx*[0-3] + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] +%if ARCH_X86_64 + movq m3, r4q +%else + movd m5, r4 + movd m6, r5 + punpckldq m5, m6 + SWAP m3, m5 +%endif + punpcklbw m3, m3 + psraw m3, 8 + mova [stk+0x100], m7 + mova [stk+0x120], m15 + mov [stk+0x0f8], srcq + mov [stk+0x130], r0q ; dstq / tmpq + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 +%if ARCH_X86_64 + mova [stk+0x140], m0 + mova [stk+0x150], m1 + mova [stk+0x160], m2 + mova [stk+0x170], m3 + %if UNIX64 + mov hm, hd + %endif +%else + mova [stk+0x180], m0 + mova [stk+0x190], m1 + mova [stk+0x1a0], m2 + mova [stk+0x1b0], m3 + SWAP m5, m3 + mov r5, hm + mov [stk+0x134], r5 +%endif + jmp .dy1_hloop +.dy1_hloop_prep: + dec dword [stk+0x0f0] + jz .ret +%if ARCH_X86_64 + add qword [stk+0x130], 16 + mov hd, hm +%else + add dword [stk+0x130], 16 + mov r5, [stk+0x134] + mov r0, [stk+0x130] +%endif + mova m7, [stk+0x100] + mova m14, [stk+0x110] +%if ARCH_X86_64 + mova m10, [base+pd_0x3ff] + mova m11, [rsp+0x10] +%endif + mova m15, [stk+0x120] + mov srcq, [stk+0x0f8] +%if ARCH_X86_64 + mov r0q, [stk+0x130] ; dstq / tmpq +%else + mov hm, r5 + mov r0m, r0 + mov r3, r3m +%endif + paddd m14, m7 +.dy1_hloop: +%if ARCH_X86_64 + mova m9, [base+pq_0x40000000] +%else + %define m9 [base+pq_0x40000000] +%endif + pxor m1, m1 + psrld m2, m14, 10 + mova [stk], m2 + pand m6, m14, m10 + psrld m6, 6 + paddd m5, m15, m6 + pcmpeqd m6, m1 + pshufd m2, m5, q1032 +%if ARCH_X86_64 + movd r4d, m5 + movd r6d, m2 + pshufd m5, m5, q0321 + pshufd m2, m2, q0321 + movd r7d, m5 + movd r9d, m2 + movq m0, [base+subpel_filters+r4*8] + movq m1, [base+subpel_filters+r6*8] + movhps m0, [base+subpel_filters+r7*8] + movhps m1, [base+subpel_filters+r9*8] +%else + movd r0, m5 + movd rX, m2 + pshufd m5, m5, q0321 + pshufd m2, m2, q0321 + movd r4, m5 + movd r5, m2 + movq m0, [base+subpel_filters+r0*8] + movq m1, [base+subpel_filters+rX*8] + movhps m0, [base+subpel_filters+r4*8] + movhps m1, [base+subpel_filters+r5*8] +%endif + paddd m14, m7 ; mx+dx*[4-7] + pand m5, m14, m10 + psrld m5, 6 + paddd m15, m5 + pxor m2, m2 + pcmpeqd m5, m2 + mova [stk+0x110], m14 + pshufd m4, m15, q1032 +%if ARCH_X86_64 + movd r10d, m15 + movd r11d, m4 + pshufd m15, m15, 
q0321 + pshufd m4, m4, q0321 + movd r13d, m15 + movd rXd, m4 + movq m2, [base+subpel_filters+r10*8] + movq m3, [base+subpel_filters+r11*8] + movhps m2, [base+subpel_filters+r13*8] + movhps m3, [base+subpel_filters+ rX*8] + psrld m14, 10 + movq r11, m14 + punpckhqdq m14, m14 + movq rX, m14 + mov r10d, r11d + shr r11, 32 + mov r13d, rXd + shr rX, 32 + mov r4d, [stk+ 0] + mov r6d, [stk+ 4] + mov r7d, [stk+ 8] + mov r9d, [stk+12] + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m14, m5, q1100 + pshufd m5, m5, q3322 + pand m7, m9, m4 + pand m8, m9, m6 + pand m15, m9, m14 + pand m9, m9, m5 + pandn m4, m0 + pandn m6, m1 + pandn m14, m2 + pandn m5, m3 + por m7, m4 + por m8, m6 + por m15, m14 + por m9, m5 + punpcklbw m0, m7, m7 + punpckhbw m7, m7 + punpcklbw m1, m8, m8 + punpckhbw m8, m8 + psraw m0, 8 + psraw m7, 8 + psraw m1, 8 + psraw m8, 8 + punpcklbw m2, m15, m15 + punpckhbw m15, m15 + punpcklbw m3, m9, m9 + punpckhbw m9, m9 + psraw m2, 8 + psraw m15, 8 + psraw m3, 8 + psraw m9, 8 + mova [stk+0x10], m0 + mova [stk+0x20], m7 + mova [stk+0x30], m1 + mova [stk+0x40], m8 + mova [stk+0x50], m2 + mova [stk+0x60], m15 + mova [stk+0x70], m3 + mova [stk+0x80], m9 + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0 + mova [stk+0x90], m1 + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1 + mova [stk+0xa0], m2 + MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2 + mova [stk+0xb0], m3 + MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3 + mova [stk+0xc0], m4 + MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4 + mova [stk+0xd0], m5 + MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5 + MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6 + MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7 + mova m5, [stk+0xd0] + mova m1, [stk+0x90] + mova m2, [stk+0xa0] + mova m3, [stk+0xb0] + mova m9, [stk+0xc0] + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m8 ; 67a + punpckhwd m7, m8 ; 67b + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m9 ; 23a + punpckhwd m3, m9 ; 23b + mova m10, [stk+0x140] + mova m11, [stk+0x150] + mova m14, [stk+0x160] + mova m15, [stk+0x170] + mova [stk+0x90], m4 + mova [stk+0xa0], m5 + mova [stk+0xb0], m6 + mova [stk+0xc0], m7 + %define hround [rsp+0x10] + %define shift [rsp+0x20] + %if isput + %define vround [rsp+0x30] + %else + %define vround [base+pd_m524256] + %endif +.dy1_vloop: + pmaddwd m4, m0, m10 + pmaddwd m5, m1, m10 + pmaddwd m6, m2, m11 + pmaddwd m7, m3, m11 + paddd m4, m13 + paddd m5, m13 + paddd m4, m6 + paddd m5, m7 + pmaddwd m6, [stk+0x90], m14 + pmaddwd m7, [stk+0xa0], m14 + pmaddwd m8, [stk+0xb0], m15 + pmaddwd m9, [stk+0xc0], m15 + paddd m4, m6 + paddd m5, m7 + %if isput + pshufd m6, m12, q1032 + %endif + paddd m4, m8 + paddd m5, m9 +%else + movd r0, m15 + movd rX, m4 + pshufd m15, m15, q0321 + pshufd m4, m4, q0321 + movd r4, m15 + movd r5, m4 + mova m14, [stk+0x110] + movq m2, [base+subpel_filters+r0*8] + movq m3, [base+subpel_filters+rX*8] + movhps m2, [base+subpel_filters+r4*8] + movhps m3, [base+subpel_filters+r5*8] + psrld m14, 10 + mova [stk+16], m14 + mov r0, [stk+ 0] + mov rX, [stk+ 4] + mov r4, [stk+ 8] + mov r5, [stk+12] + mova [stk+0x20], m0 + mova [stk+0x30], m1 + mova [stk+0x40], m2 + mova [stk+0x50], m3 + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m7, m5, q1100 + pshufd m5, m5, q3322 + pand m0, m9, m4 + pand m1, m9, m6 + pand m2, m9, m7 + pand m3, m9, m5 + pandn m4, [stk+0x20] + pandn m6, [stk+0x30] + pandn m7, [stk+0x40] + pandn m5, [stk+0x50] + por m0, m4 + por m1, m6 + por m2, m7 + por m3, m5 + punpcklbw m4, m0, m0 + punpckhbw m0, m0 + 
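+ ; duplicate each coefficient byte into both halves of a word, then the
+ ; psraw 8 below sign-extends the 8-bit filter taps to 16 bits for pmaddwd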
punpcklbw m5, m1, m1 + punpckhbw m1, m1 + psraw m4, 8 + psraw m0, 8 + psraw m5, 8 + psraw m1, 8 + punpcklbw m6, m2, m2 + punpckhbw m2, m2 + punpcklbw m7, m3, m3 + punpckhbw m3, m3 + psraw m6, 8 + psraw m2, 8 + psraw m7, 8 + psraw m3, 8 + mova [stk+0x0a0], m4 + mova [stk+0x0b0], m0 + mova [stk+0x0c0], m5 + mova [stk+0x0d0], m1 + mova [stk+0x140], m6 + mova [stk+0x150], m2 + mova [stk+0x160], m7 + mova [stk+0x170], m3 + MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0 + MC_8TAP_SCALED_H 0xa0, 0x30 ; 1 + MC_8TAP_SCALED_H 0xa0, 0x40 ; 2 + MC_8TAP_SCALED_H 0xa0, 0x50 ; 3 + MC_8TAP_SCALED_H 0xa0, 0x60 ; 4 + MC_8TAP_SCALED_H 0xa0, 0x70 ; 5 + MC_8TAP_SCALED_H 0xa0, 0x80 ; 6 + MC_8TAP_SCALED_H 0xa0, 0x90 ; 7 + mova m5, [stk+0x60] + mova m6, [stk+0x70] + mova m7, [stk+0x80] + mova m0, [stk+0x90] + mov r0, r0m + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m0 ; 67a + punpckhwd m7, m0 ; 67b + mova [stk+0x60], m4 + mova [stk+0x70], m5 + mova [stk+0x80], m6 + mova [stk+0x90], m7 + mova m1, [stk+0x20] + mova m2, [stk+0x30] + mova m3, [stk+0x40] + mova m4, [stk+0x50] + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m4 ; 23a + punpckhwd m3, m4 ; 23b + mova m4, [stk+0x180] + mova m5, [stk+0x190] + mova m6, [stk+0x1a0] + mova m7, [stk+0x1b0] + mova [stk+0x20], m0 + mova [stk+0x30], m1 + mova [stk+0x40], m2 + mova [stk+0x50], m3 +.dy1_vloop: + pmaddwd m0, m4 + pmaddwd m1, m4 + pmaddwd m2, m5 + pmaddwd m3, m5 + paddd m0, m2 + paddd m1, m3 + pmaddwd m2, [stk+0x60], m6 + pmaddwd m3, [stk+0x70], m6 + pmaddwd m4, [stk+0x80], m7 + pmaddwd m5, [stk+0x90], m7 + %if isput + movd m6, [esp+0x18] + %endif + paddd m0, m2 + paddd m1, m3 + paddd m0, vrnd_mem + paddd m1, vrnd_mem + paddd m4, m0 + paddd m5, m1 +%endif +%ifidn %1, put + psrad m4, m6 + psrad m5, m6 + packssdw m4, m5 + pxor m7, m7 + pmaxsw m4, m7 + pminsw m4, pxmaxm + mova [dstq], m4 + add dstq, dsm +%else + psrad m4, 6 + psrad m5, 6 + packssdw m4, m5 + mova [tmpq], m4 + add tmpq, tmp_stridem +%endif + dec hd + jz .dy1_hloop_prep +%if ARCH_X86_64 + movu m8, [srcq+r10*2] + movu m9, [srcq+r11*2] + movu m12, [srcq+r13*2] + movu m13, [srcq+ rX*2] + movu m4, [srcq+ r4*2] + movu m5, [srcq+ r6*2] + movu m6, [srcq+ r7*2] + movu m7, [srcq+ r9*2] + add srcq, ssq + pmaddwd m8, [stk+0x50] + pmaddwd m9, [stk+0x60] + pmaddwd m12, [stk+0x70] + pmaddwd m13, [stk+0x80] + pmaddwd m4, [stk+0x10] + pmaddwd m5, [stk+0x20] + pmaddwd m6, [stk+0x30] + pmaddwd m7, [stk+0x40] + phaddd m8, m9 + phaddd m12, m13 + mova m9, [base+unpckw] + mova m13, hround + phaddd m4, m5 + phaddd m6, m7 + phaddd m8, m12 + phaddd m4, m6 + pshufd m5, m9, q1032 + pshufb m0, m9 ; 0a 1a + pshufb m1, m9 ; 0b 1b + pshufb m2, m5 ; 3a 2a + pshufb m3, m5 ; 3b 2b + mova m12, shift + paddd m4, m13 + paddd m8, m13 + psrad m4, m12 + psrad m8, m12 + packssdw m4, m8 + pshufb m6, [stk+0x90], m9 ; 4a 5a + pshufb m7, [stk+0xa0], m9 ; 4b 5b + pshufb m8, [stk+0xb0], m5 ; 7a 6a + pshufb m13, [stk+0xc0], m5 ; 7b 6b + punpckhwd m0, m2 ; 12a + punpckhwd m1, m3 ; 12b + punpcklwd m2, m6 ; 34a + punpcklwd m3, m7 ; 34b + punpckhwd m6, m8 ; 56a + punpckhwd m7, m13 ; 56b + punpcklwd m8, m4 ; 78a + punpckhqdq m4, m4 + punpcklwd m13, m4 ; 78b + mova [stk+0x90], m6 + mova [stk+0xa0], m7 + mova [stk+0xb0], m8 + mova [stk+0xc0], m13 + mova m13, vround +%else + mov r0m, r0 + mov r3, r3m + mov r0, [stk+ 0] + mov rX, [stk+ 4] + mov r4, [stk+ 8] + mov r5, [stk+12] + MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8 + mova m7, [base+unpckw] + pshufd m4, m7, q1032 + pshufb m0, [stk+0x20], m7 ; 0a 1a + pshufb m1, [stk+0x30], 
m7 ; 0b 1b + pshufb m2, [stk+0x40], m4 ; 3a 2a + pshufb m3, [stk+0x50], m4 ; 3b 2b + pshufb m5, [stk+0x60], m7 ; 4a 5a + pshufb m6, [stk+0x70], m7 ; 4b 5b + pshufb m7, [stk+0x80], m4 ; 7a 6a + punpckhwd m0, m2 ; 12a + punpckhwd m1, m3 ; 12b + punpcklwd m2, m5 ; 34a + punpcklwd m3, m6 ; 34b + mova [stk+0x20], m0 + mova [stk+0x30], m1 + mova [stk+0x40], m2 + mova [stk+0x50], m3 + punpckhwd m5, m7 ; 56a + mova [stk+0x60], m5 + pshufb m5, [stk+0x90], m4 ; 7b 6b + punpcklwd m7, [stk+0xe0] ; 78a + mova m4, [stk+0x180] + punpckhwd m6, m5 ; 56b + mova [stk+0x70], m6 + movq m6, [stk+0xe8] + mova [stk+0x80], m7 + mova m7, [stk+0x1b0] + punpcklwd m5, m6 + mova m6, [stk+0x1a0] + mova [stk+0x90], m5 + mova m5, [stk+0x190] + mov r0, r0m +%endif + jmp .dy1_vloop +INIT_XMM ssse3 +%if ARCH_X86_64 + %define stk rsp+0x20 +%endif +.dy2: + movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2] + add wq, base_reg + jmp wq +%if isput +.dy2_w2: + %if ARCH_X86_64 + mov myd, mym + mova [rsp+0x10], m13 + %define vrnd_mem [rsp+0x10] + movzx t0d, t0b + sub srcq, 2 + movd m15, t0d + %else + %define m8 m0 + %define m9 m1 + %define m14 m4 + %define m15 m3 + %define m11 [esp+0x00] + %define m12 [esp+0x10] + %define vrnd_mem [esp+0x20] + mov r1, r1m + movzx r5, byte [esp+0x1f0] + sub srcq, 2 + movd m15, r5 + %endif + pxor m9, m9 + punpckldq m9, m8 + paddd m14, m9 ; mx+dx*[0-1] + %if ARCH_X86_64 + mova m9, [base+pd_0x4000] + %endif + pshufd m15, m15, q0000 + pand m8, m14, m10 + psrld m8, 6 + paddd m15, m8 + movd r4d, m15 + pshufd m15, m15, q0321 + %if ARCH_X86_64 + movd r6d, m15 + %else + movd r3d, m15 + %endif + mova m5, [base+bdct_lb_q] + mova m6, [base+spel_s_shuf2] + movd m15, [base+subpel_filters+r4*8+2] + %if ARCH_X86_64 + movd m7, [base+subpel_filters+r6*8+2] + %else + movd m7, [base+subpel_filters+r3*8+2] + %endif + pxor m2, m2 + pcmpeqd m8, m2 + psrld m14, 10 + paddd m14, m14 + %if ARCH_X86_32 + mov r3, r3m + pshufb m14, m5 + paddb m14, m6 + mova [stk], m14 + SWAP m5, m0 + SWAP m6, m3 + %define m15 m6 + %endif + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*2] + movu m2, [srcq+ssq*4] + punpckldq m15, m7 + %if ARCH_X86_64 + pshufb m14, m5 + paddb m14, m6 + pand m9, m8 + pandn m8, m15 + SWAP m15, m8 + por m15, m9 + movu m4, [srcq+ssq*1] + movu m5, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + movu m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + %else + pand m7, m5, [base+pd_0x4000] + pandn m5, m15 + por m5, m7 + %define m15 m5 + mov myd, mym + mov r5, [esp+0x1f4] + xor r3, r3 + shr myd, 6 + lea r5, [r5+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r5*8+0] + cmovnz r3, [base+subpel_filters+r5*8+4] + mov [stk+0x20], r3 + mov r3, r3m + %endif + punpcklbw m15, m15 + psraw m15, 8 + REPX {pshufb x, m14}, m0, m1, m2 + REPX {pmaddwd x, m15}, m0, m1, m2 + %if ARCH_X86_64 + REPX {pshufb x, m14}, m4, m5, m6 + REPX {pmaddwd x, m15}, m4, m5, m6 + phaddd m0, m1 + phaddd m1, m2 + phaddd m4, m5 + phaddd m5, m6 + REPX {paddd x, m11}, m0, m1, m4, m5 + REPX {psrad x, m12}, m0, m1, m4, m5 + packssdw m0, m1 ; 0 2 2 4 + packssdw m4, m5 ; 1 3 3 5 + SWAP m2, m4 + movq m10, r4 + %else + mova [stk+0x10], m15 + phaddd m0, m1 + phaddd m1, m2 + movu m2, [srcq+ssq*1] + movu m7, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + movu m6, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + REPX {pshufb x, m14}, m2, m7, m6 + REPX {pmaddwd x, m15}, m2, m7, m6 + %define m14 [stk+0x00] + %define m15 [stk+0x10] + phaddd m2, m7 + phaddd m7, m6 + REPX {paddd x, m11}, m0, m1, m2, m7 
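+ ; add the horizontal rounding constant (m11) and, below, shift the 32-bit
+ ; sums down to intermediate precision (m12) before repacking to words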
+ REPX {psrad x, m12}, m0, m1, m2, m7 + packssdw m0, m1 + packssdw m2, m7 + %define m8 m6 + %define m9 m4 + %define m10 m5 + movd m10, r4 + movd m9, [stk+0x20] + punpckldq m10, m9 + %endif + punpcklbw m10, m10 + psraw m10, 8 + pshufd m7, m10, q0000 + pshufd m8, m10, q1111 + pshufd m9, m10, q2222 + pshufd m10, m10, q3333 + %if ARCH_X86_32 + mova [stk+0x50], m7 + mova [stk+0x60], m8 + mova [stk+0x70], m9 + mova [stk+0x80], m10 + %xdefine m13 m7 + %define m7 [stk+0x50] + %define m8 [stk+0x60] + %define m9 [stk+0x70] + %define m10 [stk+0x80] + %endif + punpcklwd m1, m0, m2 ; 01 23 + punpckhwd m3, m0, m2 ; 23 45 + %if ARCH_X86_32 + mov r4, r0m + %define dstq r4 + mova [stk+0x20], m3 + mova [stk+0x30], m0 + %endif +.dy2_w2_loop: + movu m4, [srcq+ssq*0] + movu m5, [srcq+ssq*1] + movu m6, [srcq+ssq*2] + movu m13, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pmaddwd m3, m8 + REPX {pshufb x, m14}, m4, m5, m6, m13 + REPX {pmaddwd x, m15}, m4, m5, m6, m13 + phaddd m4, m5 + phaddd m6, m13 + pmaddwd m5, m1, m7 + paddd m4, m11 + paddd m6, m11 + psrad m4, m12 + psrad m6, m12 + packssdw m4, m6 ; 6 7 8 9 + paddd m5, m3 + pshufd m3, m4, q2200 + pshufd m4, m4, q3311 + palignr m3, m0, 12 ; 4 6 6 8 + palignr m4, m2, 12 ; 5 7 7 9 + mova m0, m3 + mova m2, m4 + punpcklwd m1, m3, m4 + punpckhwd m3, m4 + pmaddwd m6, m1, m9 + pmaddwd m4, m3, m10 + paddd m5, vrnd_mem + paddd m6, m4 + paddd m5, m6 + pshufd m4, m12, q1032 + pxor m6, m6 + psrad m5, m4 + packssdw m5, m5 + pmaxsw m5, m6 + pminsw m5, pxmaxm + movd [dstq+dsq*0], m5 + pshuflw m5, m5, q1032 + movd [dstq+dsq*1], m5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy2_w2_loop + RET +%endif +INIT_XMM ssse3 +.dy2_w4: +%if ARCH_X86_64 + mov myd, mym + mova [rsp+0x10], m11 + mova [rsp+0x20], m12 + %if isput + mova [rsp+0x30], m13 + %define vrnd_mem [rsp+0x30] + %define stk rsp+0x40 + %else + %define vrnd_mem [base+pd_m524256] + %define stk rsp+0x30 + %endif + movzx t0d, t0b + sub srcq, 2 + movd m15, t0d +%else + %define m10 [base+pd_0x3ff] + %define m9 [base+pd_0x4000] + %define m8 m0 + %xdefine m14 m4 + %define m15 m3 + %if isprep + %define ssq r3 + %endif + movzx r5, byte [esp+0x1f0] + sub srcq, 2 + movd m15, r5 +%endif + pmaddwd m8, [base+rescale_mul] +%if ARCH_X86_64 + mova m9, [base+pd_0x4000] +%endif + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] + pand m0, m14, m10 + psrld m0, 6 + paddd m15, m0 + pshufd m7, m15, q1032 +%if ARCH_X86_64 + movd r4d, m15 + movd r11d, m7 + pshufd m15, m15, q0321 + pshufd m7, m7, q0321 + movd r6d, m15 + movd r13d, m7 + mova m10, [base+bdct_lb_q+ 0] + mova m11, [base+bdct_lb_q+16] + movd m13, [base+subpel_filters+ r4*8+2] + movd m2, [base+subpel_filters+ r6*8+2] + movd m15, [base+subpel_filters+r11*8+2] + movd m4, [base+subpel_filters+r13*8+2] +%else + movd r1, m15 + movd r4, m7 + pshufd m15, m15, q0321 + pshufd m7, m7, q0321 + movd r3, m15 + movd r5, m7 + mova m5, [base+bdct_lb_q+ 0] + mova m6, [base+bdct_lb_q+16] + movd m1, [base+subpel_filters+r1*8+2] + movd m2, [base+subpel_filters+r3*8+2] + movd m3, [base+subpel_filters+r4*8+2] + movd m7, [base+subpel_filters+r5*8+2] + SWAP m4, m7 + mov r3, r3m + %if isprep + lea ss3q, [ssq*3] + %endif + %define m10 m5 + %define m11 m6 + %define m12 m1 + %define m13 m1 +%endif + psrld m14, 10 + paddd m14, m14 + punpckldq m13, m2 + punpckldq m15, m4 + punpcklqdq m13, m15 + pxor m2, m2 + pcmpeqd m0, m2 +%if ARCH_X86_64 + pand m9, m0 +%else + pand m2, m9, m0 + %define m9 m2 + SWAP m7, m4 +%endif + pandn m0, m13 +%if ARCH_X86_64 + SWAP m13, m0 +%else + %define m13 m0 +%endif + por m13, m9 + punpckhbw 
m15, m13, m13 + punpcklbw m13, m13 + psraw m15, 8 + psraw m13, 8 + pshufb m12, m14, m10 + pshufb m14, m11 + mova m10, [base+spel_s_shuf2] + movd r4d, m14 + shr r4d, 24 +%if ARCH_X86_32 + mova [stk+0x40], m13 + mova [stk+0x50], m15 + pxor m2, m2 +%endif + pshufb m7, m14, m2 + psubb m14, m7 + paddb m12, m10 + paddb m14, m10 +%if ARCH_X86_64 + lea r6, [r4+ssq*1] + lea r11, [r4+ssq*2] + lea r13, [r4+ss3q ] + movu m1, [srcq+ssq*0] + movu m8, [srcq+ssq*2] + movu m9, [srcq+ssq*1] + movu m10, [srcq+ss3q ] + movu m7, [srcq+r4 ] + movu m2, [srcq+r11 ] + movu m3, [srcq+r6 ] + movu m4, [srcq+r13 ] + lea srcq, [srcq+ssq*4] + REPX {pshufb x, m12}, m1, m9, m8, m10 + REPX {pmaddwd x, m13}, m1, m9, m8, m10 + REPX {pshufb x, m14}, m7, m3, m2, m4 + REPX {pmaddwd x, m15}, m7, m3, m2, m4 + mova m5, [rsp+0x10] + movd xm6, [rsp+0x20] + phaddd m1, m7 + phaddd m8, m2 + phaddd m9, m3 + phaddd m10, m4 + movu m2, [srcq+ssq*0] + movu m3, [srcq+ssq*1] + REPX {paddd x, m5}, m1, m9, m8, m10 + REPX {psrad x, xm6}, m1, m9, m8, m10 + packssdw m1, m8 ; 0 2 + packssdw m9, m10 ; 1 3 + movu m0, [srcq+r4 ] + movu m8, [srcq+r6 ] + lea srcq, [srcq+ssq*2] + REPX {pshufb x, m12}, m2, m3 + REPX {pmaddwd x, m13}, m2, m3 + REPX {pshufb x, m14}, m0, m8 + REPX {pmaddwd x, m15}, m0, m8 + phaddd m2, m0 + phaddd m3, m8 + shr myd, 6 + mov r9d, 64 << 24 + lea myd, [t1+myq] + cmovnz r9q, [base+subpel_filters+myq*8] + REPX {paddd x, m5}, m2, m3 + REPX {psrad x, xm6}, m2, m3 + packssdw m2, m3 ; 4 5 + pshufd m3, m2, q1032 ; 5 _ + punpcklwd m0, m1, m9 ; 01 + punpckhwd m1, m9 ; 23 + punpcklwd m2, m3 ; 45 + movq m10, r9 + %define hrnd_mem [rsp+0x10] + %define hsh_mem [rsp+0x20] + %define vsh_mem [rsp+0x28] + %if isput + %define vrnd_mem [rsp+0x30] + %else + %define vrnd_mem [base+pd_m524256] + %endif +%else + mova [stk+0x20], m12 + mova [stk+0x30], m14 + add r4, srcq + MC_4TAP_SCALED_H 0x60 ; 0 1 + MC_4TAP_SCALED_H 0x70 ; 2 3 + MC_4TAP_SCALED_H 0x80 ; 4 5 + mov [stk+0xe0], r4 + mova m3, [base+spel_s_shuf8] + mova m0, [stk+0x60] + mova m1, [stk+0x70] + mova m2, [stk+0x80] + mov myd, mym + mov rX, [esp+0x1f4] + xor r5, r5 + shr myd, 6 + lea rX, [rX+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+rX*8+0] + cmovnz r5, [base+subpel_filters+rX*8+4] + mov r3, r3m + pshufb m0, m3 ; 01 + pshufb m1, m3 ; 23 + pshufb m2, m3 ; 45 + movd m7, r4 + movd m4, r5 + mov r5, r0m + %if isput + mov r1, r1m + %endif + mov r4, [stk+0xe0] + %define dstq r5 + %define tmpq r5 + %define m12 [stk+0x20] + %define m14 [stk+0x30] + %define m13 [stk+0x40] + %define m15 [stk+0x50] + %define hrnd_mem [esp+0x00] + %define hsh_mem [esp+0x10] + %define vsh_mem [esp+0x18] + %if isput + %define vrnd_mem [esp+0x20] + %else + %define vrnd_mem [base+pd_m524256] + %endif + %define m10 m7 + punpckldq m10, m4 +%endif + punpcklbw m10, m10 + psraw m10, 8 + pshufd m3, m10, q0000 + pshufd m4, m10, q1111 + pshufd m5, m10, q2222 + pshufd m10, m10, q3333 +%if ARCH_X86_32 + %xdefine m8 m3 + %xdefine m9 m6 + %xdefine m11 m5 + %xdefine m6 m4 + mova [stk+0x100], m3 + mova [stk+0x110], m4 + mova [stk+0x120], m5 + mova [stk+0x130], m10 + %define m3 [stk+0x100] + %define m4 [stk+0x110] + %define m5 [stk+0x120] + %define m10 [stk+0x130] +%endif +.dy2_w4_loop: + pmaddwd m8, m0, m3 + pmaddwd m9, m1, m3 + mova m0, m2 + pmaddwd m1, m4 + pmaddwd m11, m2, m4 + paddd m8, vrnd_mem + paddd m9, vrnd_mem + pmaddwd m2, m5 + paddd m8, m1 + paddd m9, m11 + paddd m8, m2 + movu m6, [srcq+ssq*0] + movu m1, [srcq+ssq*2] +%if ARCH_X86_64 + movu m11, [srcq+r4 ] + movu m2, [srcq+r11] +%else + movu m11, [r4+ssq*0] + movu 
m2, [r4+ssq*2] +%endif + pshufb m6, m12 + pshufb m1, m12 + pmaddwd m6, m13 + pmaddwd m1, m13 + pshufb m11, m14 + pshufb m2, m14 + pmaddwd m11, m15 + pmaddwd m2, m15 + phaddd m6, m11 + phaddd m1, m2 + paddd m6, hrnd_mem + paddd m1, hrnd_mem + psrad m6, hsh_mem + psrad m1, hsh_mem + movu m7, [srcq+ssq*1] + movu m11, [srcq+ss3q ] + packssdw m6, m1 ; 6 8 +%if ARCH_X86_64 + movu m2, [srcq+r6 ] + movu m1, [srcq+r13] +%else + movu m2, [r4+ssq*1] + movu m1, [r4+ss3q ] +%endif + pshufb m7, m12 + pshufb m11, m12 + pmaddwd m7, m13 + pmaddwd m11, m13 + pshufb m2, m14 + pshufb m1, m14 + pmaddwd m2, m15 + pmaddwd m1, m15 + phaddd m7, m2 + phaddd m11, m1 + paddd m7, hrnd_mem + paddd m11, hrnd_mem + psrad m7, hsh_mem + psrad m11, hsh_mem + packssdw m7, m11 ; 7 9 +%if ARCH_X86_32 + lea r4, [r4+ssq*4] +%endif + lea srcq, [srcq+ssq*4] + punpcklwd m1, m6, m7 ; 67 + punpckhwd m6, m7 ; 89 + mova m2, m6 + pmaddwd m11, m1, m5 + pmaddwd m7, m1, m10 + pmaddwd m6, m10 + paddd m9, m11 +%if isput + movd m11, vsh_mem +%endif + paddd m8, m7 + paddd m9, m6 +%if isput + psrad m8, m11 + psrad m9, m11 + packssdw m8, m9 + pxor m7, m7 + pmaxsw m8, m7 + pminsw m8, pxmaxm + movq [dstq+dsq*0], m8 + movhps [dstq+dsq*1], m8 + lea dstq, [dstq+dsq*2] +%else + psrad m8, 6 + psrad m9, 6 + packssdw m8, m9 + mova [tmpq], m8 + add tmpq, 16 +%endif + sub hd, 2 + jg .dy2_w4_loop + MC_8TAP_SCALED_RET ; why not jz .ret? +INIT_XMM ssse3 +.dy2_w8: + mov dword [stk+0xf0], 1 + movifprep tmp_stridem, 16 + jmp .dy2_w_start +.dy2_w16: + mov dword [stk+0xf0], 2 + movifprep tmp_stridem, 32 + jmp .dy2_w_start +.dy2_w32: + mov dword [stk+0xf0], 4 + movifprep tmp_stridem, 64 + jmp .dy2_w_start +.dy2_w64: + mov dword [stk+0xf0], 8 + movifprep tmp_stridem, 128 + jmp .dy2_w_start +.dy2_w128: + mov dword [stk+0xf0], 16 + movifprep tmp_stridem, 256 +.dy2_w_start: + mov myd, mym +%if ARCH_X86_64 + %ifidn %1, put + movifnidn dsm, dsq + %endif + mova [rsp+0x10], m11 + mova [rsp+0x20], m12 + %define hround m11 + %if isput + mova [rsp+0x30], m13 + %else + mova m13, [base+pd_m524256] + %endif + shr t0d, 16 + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + movd m15, t0d +%else + %define hround [esp+0x00] + %define m12 [esp+0x10] + %define m10 [base+pd_0x3ff] + %define m8 m0 + %xdefine m14 m4 + %xdefine m15 m3 + %if isput + %define dstq r0 + %else + %define tmpq r0 + %define ssq ssm + %endif + mov r5, [esp+0x1f0] + mov r3, [esp+0x1f4] + shr r5, 16 + movd m15, r5 + xor r5, r5 + shr myd, 6 + lea r3, [r3+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r3*8+0] + cmovnz r5, [base+subpel_filters+r3*8+4] + mov r0, r0m + mov r3, r3m +%endif + sub srcq, 6 + pslld m7, m8, 2 ; dx*4 + pmaddwd m8, [base+rescale_mul] ; dx*[0-3] + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] +%if ARCH_X86_64 + movq m3, r4q +%else + movd m5, r4 + movd m6, r5 + punpckldq m5, m6 + SWAP m3, m5 +%endif + punpcklbw m3, m3 + psraw m3, 8 + mova [stk+0x100], m7 + mova [stk+0x120], m15 + mov [stk+0x0f8], srcq + mov [stk+0x130], r0q ; dstq / tmpq + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 +%if ARCH_X86_64 + mova [stk+0x140], m0 + mova [stk+0x150], m1 + mova [stk+0x160], m2 + mova [stk+0x170], m3 + %if UNIX64 + mov hm, hd + %endif +%else + mova [stk+0x180], m0 + mova [stk+0x190], m1 + mova [stk+0x1a0], m2 + mova [stk+0x1b0], m3 + SWAP m5, m3 + mov r5, hm + mov [stk+0x134], r5 +%endif + jmp .dy2_hloop +.dy2_hloop_prep: + dec dword [stk+0x0f0] + jz .ret +%if ARCH_X86_64 + add qword [stk+0x130], 16 + 
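+ ; [stk+0x130] caches dstq/tmpq: advance it 16 bytes (8 pixels at 16 bpc)
+ ; to the next stripe and reload the per-stripe row counter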
mov hd, hm +%else + add dword [stk+0x130], 16 + mov r5, [stk+0x134] + mov r0, [stk+0x130] +%endif + mova m7, [stk+0x100] + mova m14, [stk+0x110] +%if ARCH_X86_64 + mova m10, [base+pd_0x3ff] + mova m11, [rsp+0x10] +%endif + mova m15, [stk+0x120] + mov srcq, [stk+0x0f8] +%if ARCH_X86_64 + mov r0q, [stk+0x130] ; dstq / tmpq +%else + mov hm, r5 + mov r0m, r0 + mov r3, r3m +%endif + paddd m14, m7 +.dy2_hloop: +%if ARCH_X86_64 + mova m9, [base+pq_0x40000000] +%else + %define m9 [base+pq_0x40000000] +%endif + pxor m1, m1 + psrld m2, m14, 10 + mova [stk], m2 + pand m6, m14, m10 + psrld m6, 6 + paddd m5, m15, m6 + pcmpeqd m6, m1 + pshufd m2, m5, q1032 +%if ARCH_X86_64 + movd r4d, m5 + movd r6d, m2 + pshufd m5, m5, q0321 + pshufd m2, m2, q0321 + movd r7d, m5 + movd r9d, m2 + movq m0, [base+subpel_filters+r4*8] + movq m1, [base+subpel_filters+r6*8] + movhps m0, [base+subpel_filters+r7*8] + movhps m1, [base+subpel_filters+r9*8] +%else + movd r0, m5 + movd rX, m2 + pshufd m5, m5, q0321 + pshufd m2, m2, q0321 + movd r4, m5 + movd r5, m2 + movq m0, [base+subpel_filters+r0*8] + movq m1, [base+subpel_filters+rX*8] + movhps m0, [base+subpel_filters+r4*8] + movhps m1, [base+subpel_filters+r5*8] +%endif + paddd m14, m7 ; mx+dx*[4-7] + pand m5, m14, m10 + psrld m5, 6 + paddd m15, m5 + pxor m2, m2 + pcmpeqd m5, m2 + mova [stk+0x110], m14 + pshufd m4, m15, q1032 +%if ARCH_X86_64 + movd r10d, m15 + movd r11d, m4 + pshufd m15, m15, q0321 + pshufd m4, m4, q0321 + movd r13d, m15 + movd rXd, m4 + movq m2, [base+subpel_filters+r10*8] + movq m3, [base+subpel_filters+r11*8] + movhps m2, [base+subpel_filters+r13*8] + movhps m3, [base+subpel_filters+ rX*8] + psrld m14, 10 + movq r11, m14 + punpckhqdq m14, m14 + movq rX, m14 + mov r10d, r11d + shr r11, 32 + mov r13d, rXd + shr rX, 32 + mov r4d, [stk+ 0] + mov r6d, [stk+ 4] + mov r7d, [stk+ 8] + mov r9d, [stk+12] + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m14, m5, q1100 + pshufd m5, m5, q3322 + pand m7, m9, m4 + pand m8, m9, m6 + pand m15, m9, m14 + pand m9, m9, m5 + pandn m4, m0 + pandn m6, m1 + pandn m14, m2 + pandn m5, m3 + por m7, m4 + por m8, m6 + por m15, m14 + por m9, m5 + punpcklbw m0, m7, m7 + punpckhbw m7, m7 + punpcklbw m1, m8, m8 + punpckhbw m8, m8 + psraw m0, 8 + psraw m7, 8 + psraw m1, 8 + psraw m8, 8 + punpcklbw m2, m15, m15 + punpckhbw m15, m15 + punpcklbw m3, m9, m9 + punpckhbw m9, m9 + psraw m2, 8 + psraw m15, 8 + psraw m3, 8 + psraw m9, 8 + mova [stk+0x10], m0 + mova [stk+0x20], m7 + mova [stk+0x30], m1 + mova [stk+0x40], m8 + mova [stk+0x50], m2 + mova [stk+0x60], m15 + mova [stk+0x70], m3 + mova [stk+0x80], m9 + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0 + mova [stk+0x90], m1 + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1 + mova [stk+0xa0], m2 + MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2 + mova [stk+0xb0], m3 + MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3 + mova [stk+0xc0], m4 + MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4 + mova [stk+0xd0], m5 + MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5 + MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6 + MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7 + mova m5, [stk+0xd0] + mova m1, [stk+0x90] + mova m2, [stk+0xa0] + mova m3, [stk+0xb0] + mova m9, [stk+0xc0] + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m8 ; 67a + punpckhwd m7, m8 ; 67b + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m9 ; 23a + punpckhwd m3, m9 ; 23b + mova m10, [stk+0x140] + mova m11, [stk+0x150] + mova m14, [stk+0x160] + mova m15, [stk+0x170] + mova [stk+0x90], m4 + mova [stk+0xa0], 
m5 + mova [stk+0xb0], m6 + mova [stk+0xc0], m7 + %define hround [rsp+0x10] + %define shift [rsp+0x20] + %if isput + %define vround [rsp+0x30] + %else + %define vround [base+pd_m524256] + %endif +.dy2_vloop: + pmaddwd m4, m0, m10 + pmaddwd m5, m1, m10 + pmaddwd m6, m2, m11 + pmaddwd m7, m3, m11 + paddd m4, m13 + paddd m5, m13 + paddd m4, m6 + paddd m5, m7 + pmaddwd m6, [stk+0x90], m14 + pmaddwd m7, [stk+0xa0], m14 + pmaddwd m8, [stk+0xb0], m15 + pmaddwd m9, [stk+0xc0], m15 + paddd m4, m6 + paddd m5, m7 + %if isput + pshufd m6, m12, q1032 + %endif + paddd m4, m8 + paddd m5, m9 +%else + movd r0, m15 + movd rX, m4 + pshufd m15, m15, q0321 + pshufd m4, m4, q0321 + movd r4, m15 + movd r5, m4 + mova m14, [stk+0x110] + movq m2, [base+subpel_filters+r0*8] + movq m3, [base+subpel_filters+rX*8] + movhps m2, [base+subpel_filters+r4*8] + movhps m3, [base+subpel_filters+r5*8] + psrld m14, 10 + mova [stk+16], m14 + mov r0, [stk+ 0] + mov rX, [stk+ 4] + mov r4, [stk+ 8] + mov r5, [stk+12] + mova [stk+0x20], m0 + mova [stk+0x30], m1 + mova [stk+0x40], m2 + mova [stk+0x50], m3 + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m7, m5, q1100 + pshufd m5, m5, q3322 + pand m0, m9, m4 + pand m1, m9, m6 + pand m2, m9, m7 + pand m3, m9, m5 + pandn m4, [stk+0x20] + pandn m6, [stk+0x30] + pandn m7, [stk+0x40] + pandn m5, [stk+0x50] + por m0, m4 + por m1, m6 + por m2, m7 + por m3, m5 + punpcklbw m4, m0, m0 + punpckhbw m0, m0 + punpcklbw m5, m1, m1 + punpckhbw m1, m1 + psraw m4, 8 + psraw m0, 8 + psraw m5, 8 + psraw m1, 8 + punpcklbw m6, m2, m2 + punpckhbw m2, m2 + punpcklbw m7, m3, m3 + punpckhbw m3, m3 + psraw m6, 8 + psraw m2, 8 + psraw m7, 8 + psraw m3, 8 + mova [stk+0x0a0], m4 + mova [stk+0x0b0], m0 + mova [stk+0x0c0], m5 + mova [stk+0x0d0], m1 + mova [stk+0x140], m6 + mova [stk+0x150], m2 + mova [stk+0x160], m7 + mova [stk+0x170], m3 + MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0 + MC_8TAP_SCALED_H 0xa0, 0x30 ; 1 + MC_8TAP_SCALED_H 0xa0, 0x40 ; 2 + MC_8TAP_SCALED_H 0xa0, 0x50 ; 3 + MC_8TAP_SCALED_H 0xa0, 0x60 ; 4 + MC_8TAP_SCALED_H 0xa0, 0x70 ; 5 + MC_8TAP_SCALED_H 0xa0, 0x80 ; 6 + MC_8TAP_SCALED_H 0xa0, 0x90 ; 7 + mova m5, [stk+0x60] + mova m6, [stk+0x70] + mova m7, [stk+0x80] + mova m0, [stk+0x90] + mov r0, r0m + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m0 ; 67a + punpckhwd m7, m0 ; 67b + mova [stk+0x60], m4 + mova [stk+0x70], m5 + mova [stk+0x80], m6 + mova [stk+0x90], m7 + mova m1, [stk+0x20] + mova m2, [stk+0x30] + mova m3, [stk+0x40] + mova m4, [stk+0x50] + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m4 ; 23a + punpckhwd m3, m4 ; 23b + mova m4, [stk+0x180] + mova m5, [stk+0x190] + mova m6, [stk+0x1a0] + mova m7, [stk+0x1b0] + mova [stk+0x40], m2 + mova [stk+0x50], m3 +.dy2_vloop: + pmaddwd m0, m4 + pmaddwd m1, m4 + pmaddwd m2, m5 + pmaddwd m3, m5 + paddd m0, m2 + paddd m1, m3 + pmaddwd m2, [stk+0x60], m6 + pmaddwd m3, [stk+0x70], m6 + pmaddwd m4, [stk+0x80], m7 + pmaddwd m5, [stk+0x90], m7 + %if isput + movd m6, [esp+0x18] + %endif + paddd m0, m2 + paddd m1, m3 + paddd m0, vrnd_mem + paddd m1, vrnd_mem + paddd m4, m0 + paddd m5, m1 +%endif +%ifidn %1, put + psrad m4, m6 + psrad m5, m6 + packssdw m4, m5 + pxor m7, m7 + pmaxsw m4, m7 + pminsw m4, pxmaxm + mova [dstq], m4 + add dstq, dsm +%else + psrad m4, 6 + psrad m5, 6 + packssdw m4, m5 + mova [tmpq], m4 + add tmpq, tmp_stridem +%endif + dec hd + jz .dy2_hloop_prep +%if ARCH_X86_64 + MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 0, 1 + mova [stk+0xd0], m4 + MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 4, 0, 1 + mova m4, 
[stk+0xd0] + mova m0, m2 ; 01a + mova m1, m3 ; 01b + mova m2, [stk+0x90] ; 23a + mova m3, [stk+0xa0] ; 23b + mova m5, [stk+0xb0] ; 45a + mova m6, [stk+0xc0] ; 45b + punpcklwd m7, m4, m8 ; 67a + punpckhwd m4, m8 ; 67b + mova [stk+0x90], m5 + mova [stk+0xa0], m6 + mova [stk+0xb0], m7 + mova [stk+0xc0], m4 +%else + mov r0m, r0 + mov r3, r3m + MC_8TAP_SCALED_H 0xa0, 0xe0 ; 8 + MC_8TAP_SCALED_H 0xa0, 0 ; 9 + mova m7, [stk+0xe0] + mova m2, [stk+0x60] ; 23a + mova m3, [stk+0x70] ; 23b + mova m4, [stk+0x80] ; 45a + mova m5, [stk+0x90] ; 45b + punpcklwd m6, m7, m0 ; 67a + punpckhwd m7, m0 ; 67b + mova m0, [stk+0x40] ; 01a + mova m1, [stk+0x50] ; 01b + mova [stk+0x40], m2 + mova [stk+0x50], m3 + mova [stk+0x60], m4 + mova [stk+0x70], m5 + mova m4, [stk+0x180] + mova m5, [stk+0x190] + mova [stk+0x80], m6 + mova [stk+0x90], m7 + mova m6, [stk+0x1a0] + mova m7, [stk+0x1b0] + mov r0, r0m +%endif + jmp .dy2_vloop +INIT_XMM ssse3 +.ret: + MC_8TAP_SCALED_RET 0 +%if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT + %define r0m [rstk+stack_offset+ 4] + %define r1m [rstk+stack_offset+ 8] + %define r2m [rstk+stack_offset+12] + %define r3m [rstk+stack_offset+16] +%endif +%undef isput +%undef isprep +%endmacro + +%macro BILIN_SCALED_FN 1 +cglobal %1_bilin_scaled_16bpc + mov t0d, (5*15 << 16) | 5*15 + mov t1d, (5*15 << 16) | 5*15 + jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX) +%endmacro + +%if WIN64 +DECLARE_REG_TMP 6, 5 +%elif ARCH_X86_64 +DECLARE_REG_TMP 6, 8 +%else +DECLARE_REG_TMP 1, 2 +%endif +BILIN_SCALED_FN put +FN put_8tap_scaled, sharp, SHARP, SHARP +FN put_8tap_scaled, sharp_smooth, SHARP, SMOOTH +FN put_8tap_scaled, smooth_sharp, SMOOTH, SHARP +FN put_8tap_scaled, smooth, SMOOTH, SMOOTH +FN put_8tap_scaled, sharp_regular, SHARP, REGULAR +FN put_8tap_scaled, regular_sharp, REGULAR, SHARP +FN put_8tap_scaled, smooth_regular, SMOOTH, REGULAR +FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH +FN put_8tap_scaled, regular, REGULAR, REGULAR +MC_8TAP_SCALED put + +%if WIN64 +DECLARE_REG_TMP 5, 4 +%elif ARCH_X86_64 +DECLARE_REG_TMP 6, 7 +%else +DECLARE_REG_TMP 1, 2 +%endif +BILIN_SCALED_FN prep +FN prep_8tap_scaled, sharp, SHARP, SHARP +FN prep_8tap_scaled, sharp_smooth, SHARP, SMOOTH +FN prep_8tap_scaled, smooth_sharp, SMOOTH, SHARP +FN prep_8tap_scaled, smooth, SMOOTH, SMOOTH +FN prep_8tap_scaled, sharp_regular, SHARP, REGULAR +FN prep_8tap_scaled, regular_sharp, REGULAR, SHARP +FN prep_8tap_scaled, smooth_regular, SMOOTH, REGULAR +FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH +FN prep_8tap_scaled, regular, REGULAR, REGULAR +MC_8TAP_SCALED prep + +%if ARCH_X86_64 +DECLARE_REG_TMP 6 +%else +DECLARE_REG_TMP 2 +%endif + +%if ARCH_X86_64 +; warp8x8t spills one less xmm register than warp8x8 on WIN64, compensate that +; by allocating 16 bytes more stack space so that stack offsets match up. 
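+; (The offsets have to line up because warp_affine_8x8t_16bpc below reuses
+; warp_affine_8x8_16bpc's .main/.main2/.main3 helpers via call.)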
+%if WIN64 && STACK_ALIGNMENT == 16 +%assign stksz 16*14 +%else +%assign stksz 16*13 +%endif +cglobal warp_affine_8x8t_16bpc, 4, 13, 9, stksz, dst, ds, src, ss, delta, \ + mx, tmp, alpha, beta, \ + filter, my, gamma, cnt +%assign stack_size_padded_8x8t stack_size_padded +%else +cglobal warp_affine_8x8t_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \ + filter, mx, my +%define m8 [esp+16*13] +%define m9 [esp+16*14] +%define cntd dword [esp+4*63] +%define dstq tmpq +%define dsq 0 +%if STACK_ALIGNMENT < 16 +%define dstm [esp+4*65] +%define dsm [esp+4*66] +%else +%define dstm r0m +%define dsm r1m +%endif +%endif +%define base filterq-$$ + mov t0d, r7m + LEA filterq, $$ + shr t0d, 11 +%if ARCH_X86_64 + movddup m8, [base+warp8x8t_rnd] +%else + movddup m1, [base+warp8x8t_rnd] + mov r1, r1m + add r1, r1 + mova m8, m1 + mov r1m, r1 ; ds *= 2 +%endif + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main + jmp .start +.loop: +%if ARCH_X86_64 + lea dstq, [dstq+dsq*4] +%else + add dstq, dsm + mov dstm, dstq +%endif + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main2 +.start: +%if ARCH_X86_32 + mov dstq, dstm +%endif + paddd m1, m8 + paddd m2, m8 + psrad m1, 15 + psrad m2, 15 + packssdw m1, m2 + mova [dstq+dsq*0], m1 + call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main3 +%if ARCH_X86_32 + mov dstq, dstm + add dstq, dsm +%endif + paddd m1, m8 + paddd m2, m8 + psrad m1, 15 + psrad m2, 15 + packssdw m1, m2 + mova [dstq+dsq*2], m1 + dec cntd + jg .loop + RET + +%if ARCH_X86_64 +cglobal warp_affine_8x8_16bpc, 4, 13, 10, 16*13, dst, ds, src, ss, delta, \ + mx, tmp, alpha, beta, \ + filter, my, gamma, cnt +ASSERT stack_size_padded == stack_size_padded_8x8t +%else +cglobal warp_affine_8x8_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \ + filter, mx, my +%endif + mov t0d, r7m + LEA filterq, $$ + shr t0d, 11 +%if ARCH_X86_64 + movddup m8, [base+warp8x8_rnd2+t0*8] + movd m9, r7m ; pixel_max + pshufb m9, [base+pw_256] +%else + movddup m1, [base+warp8x8_rnd2+t0*8] + movd m2, r7m ; pixel_max + pshufb m2, [base+pw_256] + mova m8, m1 + mova m9, m2 +%endif + call .main + jmp .start +.loop: +%if ARCH_X86_64 + lea dstq, [dstq+dsq*2] +%else + add dstq, dsm + mov dstm, dstq +%endif + call .main2 +.start: +%if ARCH_X86_32 + mov dstq, dstm +%endif + psrad m1, 16 + psrad m2, 16 + packssdw m1, m2 + pmaxsw m1, m6 + pmulhrsw m1, m8 + pminsw m1, m9 + mova [dstq+dsq*0], m1 + call .main3 +%if ARCH_X86_32 + mov dstq, dstm + add dstq, dsm +%endif + psrad m1, 16 + psrad m2, 16 + packssdw m1, m2 + pmaxsw m1, m6 + pmulhrsw m1, m8 + pminsw m1, m9 + mova [dstq+dsq*1], m1 + dec cntd + jg .loop + RET +ALIGN function_align +.main: + ; Stack args offset by one (r4m -> r5m etc.) 
due to call +%if WIN64 + mov deltaq, r5m + mov mxd, r6m +%endif + movd m0, [base+warp8x8_shift+t0*4] + movddup m7, [base+warp8x8_rnd1+t0*8] + add filterq, mc_warp_filter-$$ +%if ARCH_X86_64 + movsx alphad, word [deltaq+2*0] + movsx betad, word [deltaq+2*1] + movsx gammad, word [deltaq+2*2] + movsx deltad, word [deltaq+2*3] + lea tmpq, [ssq*3] + add mxd, 512+(64<<10) + sub srcq, tmpq ; src -= ss*3 + imul tmpd, alphad, -7 + mov myd, r7m + add betad, tmpd ; beta -= alpha*7 + imul tmpd, gammad, -7 + add myd, 512+(64<<10) + mov cntd, 4 + add deltad, tmpd ; delta -= gamma*7 +%else +%if STACK_ALIGNMENT < 16 + %assign stack_offset stack_offset - gprsize +%endif + mov r3d, r5m ; abcd +%if STACK_ALIGNMENT < 16 + mov r0, r1m ; dst + mov r1, r2m ; ds + mov [esp+gprsize+4*65], r0 + mov [esp+gprsize+4*66], r1 +%endif + movsx alphad, word [r3+2*0] + movsx r2d, word [r3+2*1] + movsx gammad, word [r3+2*2] + movsx r3d, word [r3+2*3] + imul r5d, alphad, -7 + add r2d, r5d ; beta -= alpha*7 + imul r5d, gammad, -7 + mov [esp+gprsize+4*60], r2d + add r3d, r5d ; delta -= gamma*7 + mov [esp+gprsize+4*61], r3d + mov r3d, r4m ; ss + mov srcq, r3m + mov mxd, r6m + mov myd, r7m + mov dword [esp+gprsize+4*63], 4 ; cnt + mov [esp+gprsize+4*62], r3 + lea r3, [r3*3] + add mxd, 512+(64<<10) + add myd, 512+(64<<10) + sub srcq, r3 ; src -= ss*3 +%if STACK_ALIGNMENT < 16 + %assign stack_offset stack_offset + gprsize +%endif +%endif + mova [rsp+gprsize], m0 + pxor m6, m6 + call .h + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 01 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 1], m1 + mova [rsp+gprsize+16* 4], m5 + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 12 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 7], m1 + mova [rsp+gprsize+16*10], m5 + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 23 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 2], m1 + mova [rsp+gprsize+16* 5], m5 + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 34 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 8], m1 + mova [rsp+gprsize+16*11], m5 + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 45 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 3], m1 + mova [rsp+gprsize+16* 6], m5 + mova m5, m0 + call .h + punpcklwd m1, m5, m0 ; 56 + punpckhwd m5, m0 + mova [rsp+gprsize+16* 9], m1 + mova [rsp+gprsize+16*12], m5 + mova m5, m0 +.main2: + call .h +%macro WARP_V 6 ; 01l, 23l, 45l, 01h, 23h, 45h + lea tmpd, [myq+gammaq] + shr myd, 10 + movq m4, [filterq+myq*8] ; a + lea myd, [tmpq+gammaq] + shr tmpd, 10 + movq m2, [filterq+tmpq*8] ; b + lea tmpd, [myq+gammaq] + shr myd, 10 + movq m3, [filterq+myq*8] ; c + lea myd, [tmpq+gammaq] + shr tmpd, 10 + movq m1, [filterq+tmpq*8] ; d + lea tmpd, [myq+gammaq] + shr myd, 10 + punpcklwd m4, m2 + punpcklwd m3, m1 + punpckldq m2, m4, m3 + punpckhdq m4, m3 + punpcklbw m1, m6, m2 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8 + pmaddwd m1, [rsp+gprsize+16*%1] + punpckhbw m3, m6, m2 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8 + mova m2, [rsp+gprsize+16*%2] + pmaddwd m3, m2 + mova [rsp+gprsize+16*%1], m2 + paddd m1, m3 + punpcklbw m3, m6, m4 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8 + mova m2, [rsp+gprsize+16*%3] + pmaddwd m3, m2 + mova [rsp+gprsize+16*%2], m2 + paddd m1, m3 + punpcklwd m3, m5, m0 ; 67 + punpckhbw m2, m6, m4 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8 + pmaddwd m2, m3 + mova [rsp+gprsize+16*%3], m3 + paddd m1, m2 + movq m4, [filterq+myq*8] ; e + lea myd, [tmpq+gammaq] + shr tmpd, 10 + movq m3, [filterq+tmpq*8] ; f + lea tmpd, [myq+gammaq] + shr myd, 10 + movq m2, [filterq+myq*8] ; g +%if ARCH_X86_64 + lea myd, [tmpq+deltaq] ; my += delta +%else + mov myd, [esp+gprsize+4*61] + add 
myd, tmpd +%endif + shr tmpd, 10 + punpcklwd m4, m3 + movq m3, [filterq+tmpq*8] ; h + punpcklwd m2, m3 + punpckldq m3, m4, m2 + punpckhdq m4, m2 + punpcklbw m2, m6, m3 ; e0 e1 f0 f1 g0 g1 h0 h1 << 8 + pmaddwd m2, [rsp+gprsize+16*%4] + punpckhbw m6, m3 ; e2 e3 f2 f3 g2 g3 h2 h3 << 8 + mova m3, [rsp+gprsize+16*%5] + pmaddwd m6, m3 + mova [rsp+gprsize+16*%4], m3 + pxor m3, m3 + paddd m2, m6 + punpcklbw m3, m4 ; e4 e5 f4 f5 g4 g5 h4 h5 << 8 + mova m6, [rsp+gprsize+16*%6] + pmaddwd m3, m6 + mova [rsp+gprsize+16*%5], m6 + punpckhwd m5, m0 + pxor m6, m6 + paddd m2, m3 + punpckhbw m3, m6, m4 ; e6 e7 f6 f7 g6 g7 h6 h7 << 8 + pmaddwd m3, m5 + mova [rsp+gprsize+16*%6], m5 + mova m5, m0 + paddd m2, m3 +%endmacro + WARP_V 1, 2, 3, 4, 5, 6 + ret +.main3: + call .h + WARP_V 7, 8, 9, 10, 11, 12 + ret +ALIGN function_align +.h: + lea tmpd, [mxq+alphaq] + shr mxd, 10 + movq m3, [filterq+mxq*8] + punpcklbw m0, m6, m3 + movu m3, [srcq-6] + pmaddwd m0, m3 ; 0 + lea mxd, [tmpq+alphaq] + shr tmpd, 10 + movq m3, [filterq+tmpq*8] + punpcklbw m2, m6, m3 + movu m3, [srcq-4] + pmaddwd m2, m3 ; 1 + lea tmpd, [mxq+alphaq] + shr mxd, 10 + movq m3, [filterq+mxq*8] + phaddd m0, m2 ; 0 1 + punpcklbw m2, m6, m3 + movu m3, [srcq-2] + pmaddwd m2, m3 ; 2 + lea mxd, [tmpq+alphaq] + shr tmpd, 10 + movq m3, [filterq+tmpq*8] + punpcklbw m1, m6, m3 + movu m3, [srcq+0] + pmaddwd m1, m3 ; 3 + lea tmpd, [mxq+alphaq] + shr mxd, 10 + movq m3, [filterq+mxq*8] + phaddd m2, m1 ; 2 3 + punpcklbw m1, m6, m3 + movu m3, [srcq+2] + pmaddwd m1, m3 ; 4 + lea mxd, [tmpq+alphaq] + shr tmpd, 10 + movq m3, [filterq+tmpq*8] + phaddd m0, m2 ; 0 1 2 3 + punpcklbw m2, m6, m3 + movu m3, [srcq+4] + pmaddwd m2, m3 ; 5 + lea tmpd, [mxq+alphaq] + shr mxd, 10 + movq m3, [filterq+mxq*8] + phaddd m1, m2 ; 4 5 + punpcklbw m2, m6, m3 + movu m3, [srcq+6] + pmaddwd m2, m3 ; 6 +%if ARCH_X86_64 + lea mxd, [tmpq+betaq] ; mx += beta +%else + mov mxd, [esp+gprsize*2+4*60] + add mxd, tmpd +%endif + shr tmpd, 10 + movq m3, [filterq+tmpq*8] + punpcklbw m4, m6, m3 + movu m3, [srcq+8] +%if ARCH_X86_64 + add srcq, ssq +%else + add srcq, [esp+gprsize*2+4*62] +%endif + pmaddwd m3, m4 ; 7 + phaddd m2, m3 ; 6 7 + phaddd m1, m2 ; 4 5 6 7 + paddd m0, m7 + paddd m1, m7 + psrad m0, [rsp+gprsize*2] + psrad m1, [rsp+gprsize*2] + packssdw m0, m1 + ret + +%macro BIDIR_FN 0 + call .main + jmp wq +.w4_loop: + call .main + lea dstq, [dstq+strideq*2] +.w4: + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + sub hd, 4 + jg .w4_loop +.ret: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*2] +.w8: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jne .w8_loop + RET +.w16_loop: + call .main + add dstq, strideq +.w16: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + dec hd + jg .w16_loop + RET +.w32_loop: + call .main + add dstq, strideq +.w32: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + dec hd + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + call .main + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + call .main + mova [dstq+16*6], m0 + mova [dstq+16*7], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+16* 0], m0 + mova [dstq+16* 1], m1 + call .main + mova [dstq+16* 2], m0 + mova [dstq+16* 3], m1 + call .main + mova [dstq+16* 4], m0 + mova 
[dstq+16* 5], m1 + call .main + mova [dstq+16* 6], m0 + mova [dstq+16* 7], m1 + call .main + mova [dstq+16* 8], m0 + mova [dstq+16* 9], m1 + call .main + mova [dstq+16*10], m0 + mova [dstq+16*11], m1 + call .main + mova [dstq+16*12], m0 + mova [dstq+16*13], m1 + call .main + mova [dstq+16*14], m0 + mova [dstq+16*15], m1 + dec hd + jg .w128_loop + RET +%endmacro + +%if UNIX64 +DECLARE_REG_TMP 7 +%else +DECLARE_REG_TMP 5 +%endif + +cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h +%define base r6-avg_ssse3_table + LEA r6, avg_ssse3_table + tzcnt wd, wm + mov t0d, r6m ; pixel_max + movsxd wq, [r6+wq*4] + shr t0d, 11 + movddup m2, [base+bidir_rnd+t0*8] + movddup m3, [base+bidir_mul+t0*8] + movifnidn hd, hm + add wq, r6 + BIDIR_FN +ALIGN function_align +.main: + mova m0, [tmp1q+16*0] + paddsw m0, [tmp2q+16*0] + mova m1, [tmp1q+16*1] + paddsw m1, [tmp2q+16*1] + add tmp1q, 16*2 + add tmp2q, 16*2 + pmaxsw m0, m2 + pmaxsw m1, m2 + psubsw m0, m2 + psubsw m1, m2 + pmulhw m0, m3 + pmulhw m1, m3 + ret + +cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h +%define base r6-w_avg_ssse3_table + LEA r6, w_avg_ssse3_table + tzcnt wd, wm + mov t0d, r6m ; weight + movd m6, r7m ; pixel_max + movddup m5, [base+pd_65538] + movsxd wq, [r6+wq*4] + pshufb m6, [base+pw_256] + add wq, r6 + lea r6d, [t0-16] + shl t0d, 16 + sub t0d, r6d ; 16-weight, weight + paddw m5, m6 + mov r6d, t0d + shl t0d, 2 + test dword r7m, 0x800 + cmovnz r6d, t0d + movifnidn hd, hm + movd m4, r6d + pslld m5, 7 + pxor m7, m7 + pshufd m4, m4, q0000 + BIDIR_FN +ALIGN function_align +.main: + mova m2, [tmp1q+16*0] + mova m0, [tmp2q+16*0] + punpckhwd m3, m0, m2 + punpcklwd m0, m2 + mova m2, [tmp1q+16*1] + mova m1, [tmp2q+16*1] + add tmp1q, 16*2 + add tmp2q, 16*2 + pmaddwd m3, m4 + pmaddwd m0, m4 + paddd m3, m5 + paddd m0, m5 + psrad m3, 8 + psrad m0, 8 + packssdw m0, m3 + punpckhwd m3, m1, m2 + punpcklwd m1, m2 + pmaddwd m3, m4 + pmaddwd m1, m4 + paddd m3, m5 + paddd m1, m5 + psrad m3, 8 + psrad m1, 8 + packssdw m1, m3 + pminsw m0, m6 + pminsw m1, m6 + pmaxsw m0, m7 + pmaxsw m1, m7 + ret + +%if ARCH_X86_64 +cglobal mask_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask +%else +cglobal mask_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask +%define hd dword r5m +%define m8 [base+pw_64] +%endif +%define base r6-mask_ssse3_table + LEA r6, mask_ssse3_table + tzcnt wd, wm + mov t0d, r7m ; pixel_max + shr t0d, 11 + movsxd wq, [r6+wq*4] + movddup m6, [base+bidir_rnd+t0*8] + movddup m7, [base+bidir_mul+t0*8] +%if ARCH_X86_64 + mova m8, [base+pw_64] + movifnidn hd, hm +%endif + add wq, r6 + mov maskq, r6mp + BIDIR_FN +ALIGN function_align +.main: + movq m3, [maskq+8*0] + mova m0, [tmp1q+16*0] + mova m4, [tmp2q+16*0] + pxor m5, m5 + punpcklbw m3, m5 + punpckhwd m2, m0, m4 + punpcklwd m0, m4 + psubw m1, m8, m3 + punpckhwd m4, m3, m1 ; m, 64-m + punpcklwd m3, m1 + pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m) + pmaddwd m0, m3 + movq m3, [maskq+8*1] + mova m1, [tmp1q+16*1] + mova m4, [tmp2q+16*1] + add maskq, 8*2 + add tmp1q, 16*2 + add tmp2q, 16*2 + psrad m2, 5 + psrad m0, 5 + packssdw m0, m2 + punpcklbw m3, m5 + punpckhwd m2, m1, m4 + punpcklwd m1, m4 + psubw m5, m8, m3 + punpckhwd m4, m3, m5 ; m, 64-m + punpcklwd m3, m5 + pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m) + pmaddwd m1, m3 + psrad m2, 5 + psrad m1, 5 + packssdw m1, m2 + pmaxsw m0, m6 + pmaxsw m1, m6 + psubsw m0, m6 + psubsw m1, m6 + pmulhw m0, m7 + pmulhw m1, m7 + ret + +cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask +%define base t0-w_mask_420_ssse3_table + LEA 
t0, w_mask_420_ssse3_table + tzcnt wd, wm + mov r6d, r8m ; pixel_max + movd m0, r7m ; sign + shr r6d, 11 + movsxd wq, [t0+wq*4] +%if ARCH_X86_64 + mova m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 + mova m9, [base+pw_64] + movddup m10, [base+bidir_rnd+r6*8] + movddup m11, [base+bidir_mul+r6*8] +%else + mova m1, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 + mova m2, [base+pw_64] + movddup m3, [base+bidir_rnd+r6*8] + movddup m4, [base+bidir_mul+r6*8] + ALLOC_STACK -16*4 + mova [rsp+16*0], m1 + mova [rsp+16*1], m2 + mova [rsp+16*2], m3 + mova [rsp+16*3], m4 + %define m8 [rsp+gprsize+16*0] + %define m9 [rsp+gprsize+16*1] + %define m10 [rsp+gprsize+16*2] + %define m11 [rsp+gprsize+16*3] +%endif + movd m7, [base+pw_2] + psubw m7, m0 + pshufb m7, [base+pw_256] + add wq, t0 + movifnidn hd, r5m + mov maskq, r6mp + call .main + jmp wq +.w4_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 4 +.w4: + movq [dstq+strideq*0], m0 + phaddw m2, m3 + movhps [dstq+strideq*1], m0 + phaddd m2, m2 + lea dstq, [dstq+strideq*2] + paddw m2, m7 + movq [dstq+strideq*0], m1 + psrlw m2, 2 + movhps [dstq+strideq*1], m1 + packuswb m2, m2 + movd [maskq], m2 + sub hd, 4 + jg .w4_loop + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 4 +.w8: + mova [dstq+strideq*0], m0 + paddw m2, m3 + phaddw m2, m2 + mova [dstq+strideq*1], m1 + paddw m2, m7 + psrlw m2, 2 + packuswb m2, m2 + movd [maskq], m2 + sub hd, 2 + jg .w8_loop + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 8 +.w16: + mova [dstq+strideq*1+16*0], m2 + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*1+16*1], m3 + mova [dstq+strideq*0+16*1], m1 + call .main + paddw m2, [dstq+strideq*1+16*0] + paddw m3, [dstq+strideq*1+16*1] + mova [dstq+strideq*1+16*0], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*1], m1 + paddw m2, m7 + psrlw m2, 2 + packuswb m2, m2 + movq [maskq], m2 + sub hd, 2 + jg .w16_loop + RET +.w32_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 16 +.w32: + mova [dstq+strideq*1+16*0], m2 + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*1+16*1], m3 + mova [dstq+strideq*0+16*1], m1 + call .main + mova [dstq+strideq*0+16*2], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*3], m2 + mova [dstq+strideq*0+16*3], m1 + call .main + paddw m2, [dstq+strideq*1+16*0] + paddw m3, [dstq+strideq*1+16*1] + mova [dstq+strideq*1+16*0], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*2], m2 + mova [dstq+strideq*1+16*1], m1 + call .main + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16*2] + paddw m2, [dstq+strideq*1+16*3] + mova [dstq+strideq*1+16*2], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16*3], m1 + packuswb m3, m2 + mova [maskq], m3 + sub hd, 2 + jg .w32_loop + RET +.w64_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 16*2 +.w64: + mova [dstq+strideq*1+16*1], m2 + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*1+16*2], m3 + mova [dstq+strideq*0+16*1], m1 + call .main + mova [dstq+strideq*1+16*3], m2 + mova [dstq+strideq*0+16*2], m0 + mova [dstq+strideq*1+16*4], m3 + mova [dstq+strideq*0+16*3], m1 + call .main + mova [dstq+strideq*1+16*5], m2 + mova [dstq+strideq*0+16*4], m0 + mova [dstq+strideq*1+16*6], m3 + mova [dstq+strideq*0+16*5], m1 + call .main + mova [dstq+strideq*0+16*6], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*7], m2 + mova [dstq+strideq*0+16*7], m1 + call .main + paddw m2, [dstq+strideq*1+16*1] + paddw m3, [dstq+strideq*1+16*2] + mova [dstq+strideq*1+16*0], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*2], m2 + mova 
[dstq+strideq*1+16*1], m1 + call .main + paddw m2, [dstq+strideq*1+16*3] + paddw m3, [dstq+strideq*1+16*4] + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16*2] + mova [dstq+strideq*1+16*2], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16*3], m1 + packuswb m3, m2 + mova [maskq+16*0], m3 + call .main + paddw m2, [dstq+strideq*1+16*5] + paddw m3, [dstq+strideq*1+16*6] + mova [dstq+strideq*1+16*4], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*6], m2 + mova [dstq+strideq*1+16*5], m1 + call .main + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16*6] + paddw m2, [dstq+strideq*1+16*7] + mova [dstq+strideq*1+16*6], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16*7], m1 + packuswb m3, m2 + mova [maskq+16*1], m3 + sub hd, 2 + jg .w64_loop + RET +.w128_loop: + call .main + lea dstq, [dstq+strideq*2] + add maskq, 16*4 +.w128: + mova [dstq+strideq*1+16* 1], m2 + mova [dstq+strideq*0+16* 0], m0 + mova [dstq+strideq*1+16* 2], m3 + mova [dstq+strideq*0+16* 1], m1 + call .main + mova [dstq+strideq*1+16* 3], m2 + mova [dstq+strideq*0+16* 2], m0 + mova [dstq+strideq*1+16* 4], m3 + mova [dstq+strideq*0+16* 3], m1 + call .main + mova [dstq+strideq*1+16* 5], m2 + mova [dstq+strideq*0+16* 4], m0 + mova [dstq+strideq*1+16* 6], m3 + mova [dstq+strideq*0+16* 5], m1 + call .main + mova [dstq+strideq*1+16* 7], m2 + mova [dstq+strideq*0+16* 6], m0 + mova [dstq+strideq*1+16* 8], m3 + mova [dstq+strideq*0+16* 7], m1 + call .main + mova [dstq+strideq*1+16* 9], m2 + mova [dstq+strideq*0+16* 8], m0 + mova [dstq+strideq*1+16*10], m3 + mova [dstq+strideq*0+16* 9], m1 + call .main + mova [dstq+strideq*1+16*11], m2 + mova [dstq+strideq*0+16*10], m0 + mova [dstq+strideq*1+16*12], m3 + mova [dstq+strideq*0+16*11], m1 + call .main + mova [dstq+strideq*1+16*13], m2 + mova [dstq+strideq*0+16*12], m0 + mova [dstq+strideq*1+16*14], m3 + mova [dstq+strideq*0+16*13], m1 + call .main + mova [dstq+strideq*0+16*14], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*15], m2 + mova [dstq+strideq*0+16*15], m1 + call .main + paddw m2, [dstq+strideq*1+16* 1] + paddw m3, [dstq+strideq*1+16* 2] + mova [dstq+strideq*1+16* 0], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16* 2], m2 + mova [dstq+strideq*1+16* 1], m1 + call .main + paddw m2, [dstq+strideq*1+16* 3] + paddw m3, [dstq+strideq*1+16* 4] + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16* 2] + mova [dstq+strideq*1+16* 2], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16* 3], m1 + packuswb m3, m2 + mova [maskq+16*0], m3 + call .main + paddw m2, [dstq+strideq*1+16* 5] + paddw m3, [dstq+strideq*1+16* 6] + mova [dstq+strideq*1+16* 4], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16* 6], m2 + mova [dstq+strideq*1+16* 5], m1 + call .main + paddw m2, [dstq+strideq*1+16* 7] + paddw m3, [dstq+strideq*1+16* 8] + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16* 6] + mova [dstq+strideq*1+16* 6], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16* 7], m1 + packuswb m3, m2 + mova [maskq+16*1], m3 + call .main + paddw m2, [dstq+strideq*1+16* 9] + paddw m3, [dstq+strideq*1+16*10] + mova [dstq+strideq*1+16* 8], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*10], m2 + mova [dstq+strideq*1+16* 9], m1 + call .main + paddw m2, [dstq+strideq*1+16*11] + paddw m3, [dstq+strideq*1+16*12] + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16*10] + mova [dstq+strideq*1+16*10], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16*11], m1 + packuswb m3, m2 + mova [maskq+16*2], m3 + call .main + paddw m2, 
[dstq+strideq*1+16*13] + paddw m3, [dstq+strideq*1+16*14] + mova [dstq+strideq*1+16*12], m0 + phaddw m2, m3 + mova [dstq+strideq*1+16*14], m2 + mova [dstq+strideq*1+16*13], m1 + call .main + phaddw m2, m3 + paddw m3, m7, [dstq+strideq*1+16*14] + paddw m2, [dstq+strideq*1+16*15] + mova [dstq+strideq*1+16*14], m0 + paddw m2, m7 + psrlw m3, 2 + psrlw m2, 2 + mova [dstq+strideq*1+16*15], m1 + packuswb m3, m2 + mova [maskq+16*3], m3 + sub hd, 2 + jg .w128_loop + RET +ALIGN function_align +.main: +%macro W_MASK 2 ; dst/tmp_offset, mask + mova m%1, [tmp1q+16*%1] + mova m%2, [tmp2q+16*%1] + punpcklwd m4, m%2, m%1 + punpckhwd m5, m%2, m%1 + psubsw m%1, m%2 + pabsw m%1, m%1 + psubusw m6, m8, m%1 + psrlw m6, 10 ; 64-m + psubw m%2, m9, m6 ; m + punpcklwd m%1, m6, m%2 + punpckhwd m6, m%2 + pmaddwd m%1, m4 + pmaddwd m6, m5 + psrad m%1, 5 + psrad m6, 5 + packssdw m%1, m6 + pmaxsw m%1, m10 + psubsw m%1, m10 + pmulhw m%1, m11 +%endmacro + W_MASK 0, 2 + W_MASK 1, 3 + add tmp1q, 16*2 + add tmp2q, 16*2 + ret + +cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask +%define base t0-w_mask_422_ssse3_table + LEA t0, w_mask_422_ssse3_table + tzcnt wd, wm + mov r6d, r8m ; pixel_max + movd m7, r7m ; sign + shr r6d, 11 + movsxd wq, [t0+wq*4] +%if ARCH_X86_64 + mova m8, [base+pw_27615] + mova m9, [base+pw_64] + movddup m10, [base+bidir_rnd+r6*8] + movddup m11, [base+bidir_mul+r6*8] +%else + mova m1, [base+pw_27615] + mova m2, [base+pw_64] + movddup m3, [base+bidir_rnd+r6*8] + movddup m4, [base+bidir_mul+r6*8] + ALLOC_STACK -16*4 + mova [rsp+16*0], m1 + mova [rsp+16*1], m2 + mova [rsp+16*2], m3 + mova [rsp+16*3], m4 +%endif + pxor m0, m0 + add wq, t0 + pshufb m7, m0 + movifnidn hd, r5m + mov maskq, r6mp + call .main + jmp wq +.w4_loop: + call .main + lea dstq, [dstq+strideq*2] +.w4: + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + sub hd, 4 + jg .w4_loop +.end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*2] +.w8: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*2] +.w16: + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*0+16*1], m1 + call .main + mova [dstq+strideq*1+16*0], m0 + mova [dstq+strideq*1+16*1], m1 + sub hd, 2 + jg .w16_loop + RET +.w32_loop: + call .main + add dstq, strideq +.w32: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + dec hd + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + call .main + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + call .main + mova [dstq+16*6], m0 + mova [dstq+16*7], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+16* 0], m0 + mova [dstq+16* 1], m1 + call .main + mova [dstq+16* 2], m0 + mova [dstq+16* 3], m1 + call .main + mova [dstq+16* 4], m0 + mova [dstq+16* 5], m1 + call .main + mova [dstq+16* 6], m0 + mova [dstq+16* 7], m1 + call .main + mova [dstq+16* 8], m0 + mova [dstq+16* 9], m1 + call .main + mova [dstq+16*10], m0 + mova [dstq+16*11], m1 + call .main + mova [dstq+16*12], m0 + mova [dstq+16*13], m1 + call .main + mova [dstq+16*14], m0 + mova [dstq+16*15], m1 + dec hd + jg .w128_loop + RET +ALIGN function_align +.main: + W_MASK 0, 2 + W_MASK 1, 3 + phaddw m2, m3 + add tmp1q, 16*2 + add tmp2q, 16*2 + 
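+; m2 now holds the sums of horizontally adjacent blend weights; the
+; packuswb/psubb/pavgb sequence below rounds each sum into one 4:2:2 mask
+; byte. Per pixel this is, roughly, in C (a sketch of the scalar reference
+; behaviour, not the exact dav1d source; iclip_pixel() clips to the pixel
+; range and rnd/sh are bitdepth-dependent rounding constants):
+;
+;   const int d = abs(tmp1[x] - tmp2[x]);           // int16 intermediates
+;   const int m = imin(38 + ((d + 32) >> 10), 64);  // pw_27615 folds this
+;   dst[x] = iclip_pixel((tmp1[x]*m + tmp2[x]*(64 - m) + rnd) >> sh);
+;   mask[x >> 1] = (m0 + m1 + 1 - sign) >> 1;       // m0, m1: adjacent m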
packuswb m2, m2 + pxor m3, m3 + psubb m2, m7 + pavgb m2, m3 + movq [maskq], m2 + add maskq, 8 + ret + +cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask +%define base t0-w_mask_444_ssse3_table + LEA t0, w_mask_444_ssse3_table + tzcnt wd, wm + mov r6d, r8m ; pixel_max + shr r6d, 11 + movsxd wq, [t0+wq*4] +%if ARCH_X86_64 + mova m8, [base+pw_27615] + mova m9, [base+pw_64] + movddup m10, [base+bidir_rnd+r6*8] + movddup m11, [base+bidir_mul+r6*8] +%else + mova m1, [base+pw_27615] + mova m2, [base+pw_64] + movddup m3, [base+bidir_rnd+r6*8] + movddup m7, [base+bidir_mul+r6*8] + ALLOC_STACK -16*3 + mova [rsp+16*0], m1 + mova [rsp+16*1], m2 + mova [rsp+16*2], m3 + %define m11 m7 +%endif + add wq, t0 + movifnidn hd, r5m + mov maskq, r6mp + call .main + jmp wq +.w4_loop: + call .main + lea dstq, [dstq+strideq*2] +.w4: + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + sub hd, 4 + jg .w4_loop +.end: + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*2] +.w8: + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + sub hd, 2 + jg .w8_loop +.w8_end: + RET +.w16_loop: + call .main + lea dstq, [dstq+strideq*2] +.w16: + mova [dstq+strideq*0+16*0], m0 + mova [dstq+strideq*0+16*1], m1 + call .main + mova [dstq+strideq*1+16*0], m0 + mova [dstq+strideq*1+16*1], m1 + sub hd, 2 + jg .w16_loop + RET +.w32_loop: + call .main + add dstq, strideq +.w32: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + dec hd + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + call .main + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + call .main + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + call .main + mova [dstq+16*6], m0 + mova [dstq+16*7], m1 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+16* 0], m0 + mova [dstq+16* 1], m1 + call .main + mova [dstq+16* 2], m0 + mova [dstq+16* 3], m1 + call .main + mova [dstq+16* 4], m0 + mova [dstq+16* 5], m1 + call .main + mova [dstq+16* 6], m0 + mova [dstq+16* 7], m1 + call .main + mova [dstq+16* 8], m0 + mova [dstq+16* 9], m1 + call .main + mova [dstq+16*10], m0 + mova [dstq+16*11], m1 + call .main + mova [dstq+16*12], m0 + mova [dstq+16*13], m1 + call .main + mova [dstq+16*14], m0 + mova [dstq+16*15], m1 + dec hd + jg .w128_loop + RET +ALIGN function_align +.main: + W_MASK 0, 2 + W_MASK 1, 3 + packuswb m2, m3 + add tmp1q, 16*2 + add tmp2q, 16*2 + mova [maskq], m2 + add maskq, 16 + ret + +; (a * (64 - m) + b * m + 32) >> 6 +; = (((b - a) * m + 32) >> 6) + a +; = (((b - a) * (m << 9) + 16384) >> 15) + a +; except m << 9 overflows int16_t when m == 64 (which is possible), +; but if we negate m it works out (-64 << 9 == -32768). 
+; = (((a - b) * (m * -512) + 16384) >> 15) + a +cglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h, mask, stride3 +%define base r6-blend_ssse3_table + LEA r6, blend_ssse3_table + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r6+wq*4] + movifnidn maskq, maskmp + mova m7, [base+pw_m512] + add wq, r6 + lea stride3q, [strideq*3] + pxor m6, m6 + jmp wq +.w4: + mova m5, [maskq] + movq m0, [dstq+strideq*0] + movhps m0, [dstq+strideq*1] + movq m1, [dstq+strideq*2] + movhps m1, [dstq+stride3q ] + psubw m2, m0, [tmpq+16*0] + psubw m3, m1, [tmpq+16*1] + add maskq, 16 + add tmpq, 32 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + pmullw m4, m7 + pmullw m5, m7 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [dstq+strideq*2], m1 + movhps [dstq+stride3q ], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w4 + RET +.w8: + mova m5, [maskq] + mova m0, [dstq+strideq*0] + mova m1, [dstq+strideq*1] + psubw m2, m0, [tmpq+16*0] + psubw m3, m1, [tmpq+16*1] + add maskq, 16 + add tmpq, 32 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + pmullw m4, m7 + pmullw m5, m7 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8 + RET +.w16: + mova m5, [maskq] + mova m0, [dstq+16*0] + mova m1, [dstq+16*1] + psubw m2, m0, [tmpq+16*0] + psubw m3, m1, [tmpq+16*1] + add maskq, 16 + add tmpq, 32 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + pmullw m4, m7 + pmullw m5, m7 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + add dstq, strideq + dec hd + jg .w16 + RET +.w32: + mova m5, [maskq+16*0] + mova m0, [dstq+16*0] + mova m1, [dstq+16*1] + psubw m2, m0, [tmpq+16*0] + psubw m3, m1, [tmpq+16*1] + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + pmullw m4, m7 + pmullw m5, m7 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova m5, [maskq+16*1] + mova m0, [dstq+16*2] + mova m1, [dstq+16*3] + psubw m2, m0, [tmpq+16*2] + psubw m3, m1, [tmpq+16*3] + add maskq, 32 + add tmpq, 64 + punpcklbw m4, m5, m6 + punpckhbw m5, m6 + pmullw m4, m7 + pmullw m5, m7 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + add dstq, strideq + dec hd + jg .w32 + RET + +cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h +%define base r5-blend_v_ssse3_table + LEA r5, blend_v_ssse3_table + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r5+wq*4] + add wq, r5 + jmp wq +.w2: + movd m4, [base+obmc_masks+2*2] +.w2_loop: + movd m0, [dstq+strideq*0] + movd m2, [tmpq+4*0] + movd m1, [dstq+strideq*1] + movd m3, [tmpq+4*1] + add tmpq, 4*2 + psubw m2, m0 + psubw m3, m1 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + paddw m0, m2 + paddw m1, m3 + movd [dstq+strideq*0], m0 + movd [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w2_loop + RET +.w4: + movddup m2, [base+obmc_masks+4*2] +.w4_loop: + movq m0, [dstq+strideq*0] + movhps m0, [dstq+strideq*1] + mova m1, [tmpq] + add tmpq, 8*2 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m0, m1 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w4_loop + RET +.w8: + mova m4, [base+obmc_masks+8*2] +.w8_loop: + mova m0, [dstq+strideq*0] + mova m2, [tmpq+16*0] + mova m1, [dstq+strideq*1] + mova m3, [tmpq+16*1] + add tmpq, 16*2 + psubw m2, m0 + psubw m3, m1 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + paddw m0, m2 
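+; the identity derived just before blend_16bpc is what makes the
+; dst + pmulhrsw(tmp - dst, w) pattern in these blend functions exact;
+; a minimal C self-check of it (illustrative only, not part of dav1d;
+; assumes arithmetic >> on negatives, as on all dav1d targets):
+;
+;   #include <assert.h>
+;   #include <stdint.h>
+;   int main(void) {
+;       for (int m = 0; m <= 64; m++)
+;           for (int a = -2048; a <= 2048; a++) {  // sample of int16 range
+;               const int b = 1024 - a;            // arbitrary second sample
+;               const int ref = (a * (64 - m) + b * m + 32) >> 6;
+;               const int16_t w = m * -512;        // fits int16_t at m == 64
+;               const int opt = (((a - b) * w + 16384) >> 15) + a;
+;               assert(ref == opt);                // pmulhrsw rounding matches
+;           }
+;       return 0;
+;   }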
+ paddw m1, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_loop + RET +.w16: + mova m4, [base+obmc_masks+16*2] + movq m5, [base+obmc_masks+16*3] +.w16_loop: + mova m0, [dstq+16*0] + mova m2, [tmpq+16*0] + mova m1, [dstq+16*1] + mova m3, [tmpq+16*1] + add tmpq, 16*2 + psubw m2, m0 + psubw m3, m1 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + add dstq, strideq + dec hd + jg .w16_loop + RET +.w32: +%if WIN64 + movaps [rsp+8], m6 +%endif + mova m4, [base+obmc_masks+16*4] + mova m5, [base+obmc_masks+16*5] + mova m6, [base+obmc_masks+16*6] +.w32_loop: + mova m0, [dstq+16*0] + mova m2, [tmpq+16*0] + mova m1, [dstq+16*1] + mova m3, [tmpq+16*1] + psubw m2, m0 + psubw m3, m1 + pmulhrsw m2, m4 + pmulhrsw m3, m5 + paddw m0, m2 + mova m2, [dstq+16*2] + paddw m1, m3 + mova m3, [tmpq+16*2] + add tmpq, 16*4 + psubw m3, m2 + pmulhrsw m3, m6 + paddw m2, m3 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + add dstq, strideq + dec hd + jg .w32_loop +%if WIN64 + movaps m6, [rsp+8] +%endif + RET + +%macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp + mova m0, [dstq+16*(%1+0)] + mova m2, [tmpq+16*(%2+0)] + mova m1, [dstq+16*(%1+1)] + mova m3, [tmpq+16*(%2+1)] +%if %3 + add tmpq, 16*%3 +%endif + psubw m2, m0 + psubw m3, m1 + pmulhrsw m2, m5 + pmulhrsw m3, m5 + paddw m0, m2 + paddw m1, m3 + mova [dstq+16*(%1+0)], m0 + mova [dstq+16*(%1+1)], m1 +%endmacro + +cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, mask +%define base r6-blend_h_ssse3_table + LEA r6, blend_h_ssse3_table + tzcnt wd, wm + mov hd, hm + movsxd wq, [r6+wq*4] + movddup m4, [base+blend_shuf] + lea maskq, [base+obmc_masks+hq*2] + lea hd, [hq*3] + add wq, r6 + shr hd, 2 ; h * 3/4 + lea maskq, [maskq+hq*2] + neg hq + jmp wq +.w2: + movd m0, [dstq+dsq*0] + movd m2, [dstq+dsq*1] + movd m3, [maskq+hq*2] + movq m1, [tmpq] + add tmpq, 4*2 + punpckldq m0, m2 + punpcklwd m3, m3 + psubw m1, m0 + pmulhrsw m1, m3 + paddw m0, m1 + movd [dstq+dsq*0], m0 + psrlq m0, 32 + movd [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w2 + RET +.w4: + mova m3, [base+blend_shuf] +.w4_loop: + movq m0, [dstq+dsq*0] + movhps m0, [dstq+dsq*1] + movd m2, [maskq+hq*2] + mova m1, [tmpq] + add tmpq, 8*2 + psubw m1, m0 + pshufb m2, m3 + pmulhrsw m1, m2 + paddw m0, m1 + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w4_loop + RET +.w8: + movddup m5, [base+blend_shuf+8] +%if WIN64 + movaps [rsp+ 8], m6 + movaps [rsp+24], m7 +%endif +.w8_loop: + movd m7, [maskq+hq*2] + mova m0, [dstq+dsq*0] + mova m2, [tmpq+16*0] + mova m1, [dstq+dsq*1] + mova m3, [tmpq+16*1] + add tmpq, 16*2 + pshufb m6, m7, m4 + psubw m2, m0 + pshufb m7, m5 + psubw m3, m1 + pmulhrsw m2, m6 + pmulhrsw m3, m7 + paddw m0, m2 + paddw m1, m3 + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w8_loop +%if WIN64 + movaps m6, [rsp+ 8] + movaps m7, [rsp+24] +%endif + RET +.w16: + movd m5, [maskq+hq*2] + pshufb m5, m4 + BLEND_H_ROW 0, 0, 2 + add dstq, dsq + inc hq + jl .w16 + RET +.w32: + movd m5, [maskq+hq*2] + pshufb m5, m4 + BLEND_H_ROW 0, 0 + BLEND_H_ROW 2, 2, 4 + add dstq, dsq + inc hq + jl .w32 + RET +.w64: + movd m5, [maskq+hq*2] + pshufb m5, m4 + BLEND_H_ROW 0, 0 + BLEND_H_ROW 2, 2 + BLEND_H_ROW 4, 4 + BLEND_H_ROW 6, 6, 8 + add dstq, dsq + inc hq + jl .w64 + RET +.w128: + movd m5, [maskq+hq*2] + pshufb m5, m4 + BLEND_H_ROW 0, 0 + BLEND_H_ROW 2, 2 + BLEND_H_ROW 4, 4 + 
BLEND_H_ROW 6, 6, 16 + BLEND_H_ROW 8, -8 + BLEND_H_ROW 10, -6 + BLEND_H_ROW 12, -4 + BLEND_H_ROW 14, -2 + add dstq, dsq + inc hq + jl .w128 + RET + +; emu_edge args: +; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih, +; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride, +; const pixel *ref, const ptrdiff_t ref_stride +; +; bw, bh total filled size +; iw, ih, copied block -> fill bottom, right +; x, y, offset in bw/bh -> fill top, left +cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, \ + y, dst, dstride, src, sstride, \ + bottomext, rightext, blk + ; we assume that the buffer (stride) is larger than width, so we can + ; safely overwrite by a few bytes + +%if ARCH_X86_64 + %define reg_zero r12q + %define reg_tmp r10 + %define reg_src srcq + %define reg_bottomext bottomextq + %define reg_rightext rightextq + %define reg_blkm r9m +%else + %define reg_zero r6 + %define reg_tmp r0 + %define reg_src r1 + %define reg_bottomext r0 + %define reg_rightext r1 + %define reg_blkm r2m +%endif + ; + ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + xor reg_zero, reg_zero + lea reg_tmp, [ihq-1] + cmp yq, ihq + cmovs reg_tmp, yq + test yq, yq + cmovs reg_tmp, reg_zero +%if ARCH_X86_64 + imul reg_tmp, sstrideq + add srcq, reg_tmp +%else + imul reg_tmp, sstridem + mov reg_src, srcm + add reg_src, reg_tmp +%endif + ; + ; ref += iclip(x, 0, iw - 1) + lea reg_tmp, [iwq-1] + cmp xq, iwq + cmovs reg_tmp, xq + test xq, xq + cmovs reg_tmp, reg_zero + lea reg_src, [reg_src+reg_tmp*2] +%if ARCH_X86_32 + mov srcm, reg_src +%endif + ; + ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) +%if ARCH_X86_32 + mov r1, r1m ; restore bh +%endif + lea reg_bottomext, [yq+bhq] + sub reg_bottomext, ihq + lea r3, [bhq-1] + cmovs reg_bottomext, reg_zero + ; + + DEFINE_ARGS bw, bh, iw, ih, x, \ + topext, dst, dstride, src, sstride, \ + bottomext, rightext, blk + + ; top_ext = iclip(-y, 0, bh - 1) + neg topextq + cmovs topextq, reg_zero + cmp reg_bottomext, bhq + cmovns reg_bottomext, r3 + cmp topextq, bhq + cmovg topextq, r3 + %if ARCH_X86_32 + mov r4m, reg_bottomext + ; + ; right_ext = iclip(x + bw - iw, 0, bw - 1) + mov r0, r0m ; restore bw + %endif + lea reg_rightext, [xq+bwq] + sub reg_rightext, iwq + lea r2, [bwq-1] + cmovs reg_rightext, reg_zero + + DEFINE_ARGS bw, bh, iw, ih, leftext, \ + topext, dst, dstride, src, sstride, \ + bottomext, rightext, blk + + ; left_ext = iclip(-x, 0, bw - 1) + neg leftextq + cmovs leftextq, reg_zero + cmp reg_rightext, bwq + cmovns reg_rightext, r2 + %if ARCH_X86_32 + mov r3m, r1 + %endif + cmp leftextq, bwq + cmovns leftextq, r2 + +%undef reg_zero +%undef reg_tmp +%undef reg_src +%undef reg_bottomext +%undef reg_rightext + + DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \ + topext, dst, dstride, src, sstride, \ + bottomext, rightext, blk + + ; center_h = bh - top_ext - bottom_ext +%if ARCH_X86_64 + lea r3, [bottomextq+topextq] + sub centerhq, r3 +%else + mov r1, centerhm ; restore r1 + sub centerhq, topextq + sub centerhq, r4m + mov r1m, centerhq +%endif + ; + ; blk += top_ext * PXSTRIDE(dst_stride) + mov r2, topextq +%if ARCH_X86_64 + imul r2, dstrideq +%else + mov r6, r6m ; restore dstq + imul r2, dstridem +%endif + add dstq, r2 + mov reg_blkm, dstq ; save pointer for ext + ; + ; center_w = bw - left_ext - right_ext + mov centerwq, bwq +%if ARCH_X86_64 + lea r3, [rightextq+leftextq] + sub centerwq, r3 +%else + sub centerwq, r3m + sub centerwq, leftextq +%endif + +; vloop Macro +%macro v_loop 3 ; need_left_ext, need_right_ext, suffix 
+ %if ARCH_X86_64 + %define reg_tmp r12 + %else + %define reg_tmp r0 + %endif +.v_loop_%3: + %if ARCH_X86_32 + mov r0, r0m + mov r1, r1m + %endif +%if %1 + ; left extension + %if ARCH_X86_64 + movd m0, [srcq] + %else + mov r3, srcm + movd m0, [r3] + %endif + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + xor r3, r3 +.left_loop_%3: + mova [dstq+r3*2], m0 + add r3, mmsize/2 + cmp r3, leftextq + jl .left_loop_%3 + ; body + lea reg_tmp, [dstq+leftextq*2] +%endif + xor r3, r3 +.body_loop_%3: + %if ARCH_X86_64 + movu m0, [srcq+r3*2] + %else + mov r1, srcm + movu m0, [r1+r3*2] + %endif +%if %1 + movu [reg_tmp+r3*2], m0 +%else + movu [dstq+r3*2], m0 +%endif + add r3, mmsize/2 + cmp r3, centerwq + jl .body_loop_%3 +%if %2 + ; right extension +%if %1 + lea reg_tmp, [reg_tmp+centerwq*2] +%else + lea reg_tmp, [dstq+centerwq*2] +%endif + %if ARCH_X86_64 + movd m0, [srcq+centerwq*2-2] + %else + mov r3, srcm + movd m0, [r3+centerwq*2-2] + %endif + pshuflw m0, m0, q0000 + punpcklqdq m0, m0 + xor r3, r3 +.right_loop_%3: + movu [reg_tmp+r3*2], m0 + add r3, mmsize/2 + %if ARCH_X86_64 + cmp r3, rightextq + %else + cmp r3, r3m + %endif + jl .right_loop_%3 +%endif + %if ARCH_X86_64 + add dstq, dstrideq + add srcq, sstrideq + dec centerhq + jg .v_loop_%3 + %else + add dstq, dstridem + mov r0, sstridem + add srcm, r0 + sub dword centerhm, 1 + jg .v_loop_%3 + mov r0, r0m ; restore r0 + %endif +%endmacro ; vloop MACRO + + test leftextq, leftextq + jnz .need_left_ext + %if ARCH_X86_64 + test rightextq, rightextq + jnz .need_right_ext + %else + cmp leftextq, r3m ; leftextq == 0 + jne .need_right_ext + %endif + v_loop 0, 0, 0 + jmp .body_done + + ;left right extensions +.need_left_ext: + %if ARCH_X86_64 + test rightextq, rightextq + %else + mov r3, r3m + test r3, r3 + %endif + jnz .need_left_right_ext + v_loop 1, 0, 1 + jmp .body_done + +.need_left_right_ext: + v_loop 1, 1, 2 + jmp .body_done + +.need_right_ext: + v_loop 0, 1, 3 + +.body_done: +; r0 ; bw +; r1 ;; x loop +; r4 ;; y loop +; r5 ; topextq +; r6 ;dstq +; r7 ;dstrideq +; r8 ; srcq +%if ARCH_X86_64 + %define reg_dstride dstrideq +%else + %define reg_dstride r2 +%endif + ; + ; bottom edge extension + %if ARCH_X86_64 + test bottomextq, bottomextq + jz .top + %else + xor r1, r1 + cmp r1, r4m + je .top + %endif + ; + %if ARCH_X86_64 + mov srcq, dstq + sub srcq, dstrideq + xor r1, r1 + %else + mov r3, dstq + mov reg_dstride, dstridem + sub r3, reg_dstride + mov srcm, r3 + %endif + ; +.bottom_x_loop: + %if ARCH_X86_64 + mova m0, [srcq+r1*2] + lea r3, [dstq+r1*2] + mov r4, bottomextq + %else + mov r3, srcm + mova m0, [r3+r1*2] + lea r3, [dstq+r1*2] + mov r4, r4m + %endif + ; +.bottom_y_loop: + mova [r3], m0 + add r3, reg_dstride + dec r4 + jg .bottom_y_loop + add r1, mmsize/2 + cmp r1, bwq + jl .bottom_x_loop + +.top: + ; top edge extension + test topextq, topextq + jz .end +%if ARCH_X86_64 + mov srcq, reg_blkm +%else + mov r3, reg_blkm + mov reg_dstride, dstridem +%endif + mov dstq, dstm + xor r1, r1 + ; +.top_x_loop: +%if ARCH_X86_64 + mova m0, [srcq+r1*2] +%else + mov r3, reg_blkm + mova m0, [r3+r1*2] +%endif + lea r3, [dstq+r1*2] + mov r4, topextq + ; +.top_y_loop: + mova [r3], m0 + add r3, reg_dstride + dec r4 + jg .top_y_loop + add r1, mmsize/2 + cmp r1, bwq + jl .top_x_loop + +.end: + RET + +%undef reg_dstride +%undef reg_blkm +%undef reg_tmp + +%macro SCRATCH 3 +%if ARCH_X86_32 + mova [rsp+%3*mmsize], m%1 +%define m%2 [rsp+%3*mmsize] +%else + SWAP %1, %2 +%endif +%endmacro + +%if ARCH_X86_64 +cglobal resize_16bpc, 0, 12, 16, 1*16, dst, dst_stride, src, 
src_stride, \ + dst_w, h, src_w, dx, mx0, pxmax +%elif STACK_ALIGNMENT >= 16 +cglobal resize_16bpc, 0, 7, 8, 6*16, dst, dst_stride, src, src_stride, \ + dst_w, h, src_w, dx, mx0, pxmax +%else +cglobal resize_16bpc, 0, 6, 8, 6*16, dst, dst_stride, src, src_stride, \ + dst_w, h, src_w, dx, mx0, pxmax +%endif + movifnidn dstq, dstmp + movifnidn srcq, srcmp +%if STACK_ALIGNMENT >= 16 + movifnidn dst_wd, dst_wm +%endif +%if ARCH_X86_64 + movifnidn hd, hm +%endif + sub dword mx0m, 4<<14 + sub dword src_wm, 8 + movd m4, pxmaxm + movd m7, dxm + movd m6, mx0m + movd m5, src_wm + punpcklwd m4, m4 + pshufd m4, m4, q0000 + pshufd m7, m7, q0000 + pshufd m6, m6, q0000 + pshufd m5, m5, q0000 + mova [rsp+16*3*ARCH_X86_32], m4 +%if ARCH_X86_64 + DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x + LEA r7, $$ + %define base r7-$$ +%else + DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x + %define hd dword r5m + %if STACK_ALIGNMENT >= 16 + LEA r6, $$ + %define base r6-$$ + %else + LEA r4, $$ + %define base r4-$$ + %endif +%endif +%if ARCH_X86_64 + mova m12, [base+pd_64] + mova m11, [base+pd_63] +%else + %define m12 [base+pd_64] + %define m11 [base+pd_63] +%endif + pmaddwd m4, m7, [base+rescale_mul] ; dx*[0,1,2,3] + pslld m7, 2 ; dx*4 + pslld m5, 14 + paddd m6, m4 ; mx+[0..3]*dx + SCRATCH 7, 15, 0 + SCRATCH 6, 14, 1 + SCRATCH 5, 13, 2 + pxor m1, m1 +.loop_y: + xor xd, xd + mova m0, m14 ; per-line working version of mx +.loop_x: + pcmpgtd m1, m0 + pandn m1, m0 + psrad m2, m0, 8 ; filter offset (unmasked) + pcmpgtd m3, m13, m1 + pand m1, m3 + pandn m3, m13 + por m1, m3 + psubd m3, m0, m1 ; pshufb offset + psrad m1, 14 ; clipped src_x offset + psrad m3, 14 ; pshufb edge_emu offset + pand m2, m11 ; filter offset (masked) + ; load source pixels +%if ARCH_X86_64 + movd r8d, m1 + pshuflw m1, m1, q3232 + movd r9d, m1 + punpckhqdq m1, m1 + movd r10d, m1 + psrlq m1, 32 + movd r11d, m1 + movu m4, [srcq+r8*2] + movu m5, [srcq+r9*2] + movu m6, [srcq+r10*2] + movu m7, [srcq+r11*2] + ; if no emulation is required, we don't need to shuffle or emulate edges + packssdw m3, m3 + movq r11, m3 + test r11, r11 + jz .filter + movsx r8, r11w + sar r11, 16 + movsx r9, r11w + sar r11, 16 + movsx r10, r11w + sar r11, 16 + movu m1, [base+resize_shuf+8+r8*2] + movu m3, [base+resize_shuf+8+r9*2] + movu m8, [base+resize_shuf+8+r10*2] + movu m9, [base+resize_shuf+8+r11*2] + pshufb m4, m1 + pshufb m5, m3 + pshufb m6, m8 + pshufb m7, m9 +.filter: + movd r8d, m2 + pshuflw m2, m2, q3232 + movd r9d, m2 + punpckhqdq m2, m2 + movd r10d, m2 + psrlq m2, 32 + movd r11d, m2 + movq m8, [base+resize_filter+r8*8] + movq m2, [base+resize_filter+r9*8] + pxor m9, m9 + punpcklbw m1, m9, m8 + punpcklbw m3, m9, m2 + psraw m1, 8 + psraw m3, 8 + movq m10, [base+resize_filter+r10*8] + movq m2, [base+resize_filter+r11*8] + punpcklbw m8, m9, m10 + punpcklbw m9, m2 + psraw m8, 8 + psraw m9, 8 + pmaddwd m4, m1 + pmaddwd m5, m3 + pmaddwd m6, m8 + pmaddwd m7, m9 + phaddd m4, m5 +%else + movd r3, m1 + pshuflw m1, m1, q3232 + movd r1, m1 + punpckhqdq m1, m1 + movu m4, [srcq+r3*2] + movu m5, [srcq+r1*2] + movd r3, m1 + psrlq m1, 32 + movd r1, m1 + movu m6, [srcq+r3*2] + movu m7, [srcq+r1*2] + ; if no emulation is required, we don't need to shuffle or emulate edges + pxor m1, m1 + pcmpeqb m1, m3 + pmovmskb r3d, m1 + cmp r3d, 0xffff + je .filter + movd r3, m3 + movu m1, [base+resize_shuf+8+r3*2] + pshuflw m3, m3, q3232 + movd r1, m3 + pshufb m4, m1 + movu m1, [base+resize_shuf+8+r1*2] + punpckhqdq m3, m3 + movd r3, m3 + pshufb m5, m1 + movu m1, 
[base+resize_shuf+8+r3*2] + psrlq m3, 32 + movd r1, m3 + pshufb m6, m1 + movu m1, [base+resize_shuf+8+r1*2] + pshufb m7, m1 +.filter: + mova [esp+4*16], m6 + mova [esp+5*16], m7 + movd r3, m2 + pshuflw m2, m2, q3232 + movd r1, m2 + movq m6, [base+resize_filter+r3*8] + movq m7, [base+resize_filter+r1*8] + pxor m3, m3 + punpcklbw m1, m3, m6 + punpcklbw m3, m7 + psraw m1, 8 + psraw m3, 8 + pmaddwd m4, m1 + pmaddwd m5, m3 + punpckhqdq m2, m2 + movd r3, m2 + psrlq m2, 32 + movd r1, m2 + phaddd m4, m5 + movq m2, [base+resize_filter+r3*8] + movq m5, [base+resize_filter+r1*8] + mova m6, [esp+4*16] + mova m7, [esp+5*16] + pxor m3, m3 + punpcklbw m1, m3, m2 + punpcklbw m3, m5 + psraw m1, 8 + psraw m3, 8 + pmaddwd m6, m1 + pmaddwd m7, m3 +%endif + phaddd m6, m7 + phaddd m4, m6 + pxor m1, m1 + psubd m2, m12, m4 + psrad m2, 7 + packssdw m2, m2 + pmaxsw m2, m1 + pminsw m2, [rsp+16*3*ARCH_X86_32] + movq [dstq+xq*2], m2 + paddd m0, m15 + add xd, 4 +%if STACK_ALIGNMENT >= 16 + cmp xd, dst_wd +%else + cmp xd, dst_wm +%endif + jl .loop_x + add dstq, dst_stridemp + add srcq, src_stridemp + dec hd + jg .loop_y + RET diff --git a/third_party/dav1d/src/x86/mc_avx2.asm b/third_party/dav1d/src/x86/mc_avx2.asm new file mode 100644 index 0000000000..3b208033bd --- /dev/null +++ b/third_party/dav1d/src/x86/mc_avx2.asm @@ -0,0 +1,5669 @@ +; Copyright © 2018-2021, VideoLAN and dav1d authors +; Copyright © 2018-2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 + +; dav1d_obmc_masks[] with 64-x interleaved +obmc_masks: db 0, 0, 0, 0 + ; 2 + db 45, 19, 64, 0 + ; 4 + db 39, 25, 50, 14, 59, 5, 64, 0 + ; 8 + db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0 + ; 16 + db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10 + db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0 + ; 32 + db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20 + db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9 + db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2 + db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0 + +warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8 + db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12 +warp_8x8_shufB: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10 + db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14 +subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 + db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14 +subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 +subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 +subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +subpel_v_shuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 +subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11 +subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 +bilin_h_shuf8: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7 +deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 +blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 +pb_8x0_8x8: db 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8 +bdct_lb_dw: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 +wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7 +resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7 + +wm_420_sign: dd 0x01020102, 0x01010101 +wm_422_sign: dd 0x80808080, 0x7f7f7f7f + +pb_64: times 4 db 64 +pw_m256: times 2 dw -256 +pw_15: times 2 dw 15 +pw_32: times 2 dw 32 +pw_34: times 2 dw 34 +pw_258: times 2 dw 258 +pw_512: times 2 dw 512 +pw_1024: times 2 dw 1024 +pw_2048: times 2 dw 2048 +pw_6903: times 2 dw 6903 +pw_8192: times 2 dw 8192 +pd_32: dd 32 +pd_63: dd 63 +pd_512: dd 512 +pd_32768: dd 32768 +pd_0x3ff: dd 0x3ff +pd_0x4000: dd 0x4000 +pq_0x40000000: dq 0x40000000 + +cextern mc_subpel_filters +cextern mc_warp_filter2 +cextern resize_filter + +%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) + +%macro BASE_JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - %3) + %xdefine %%base %1_%2 + %%table: + %rep %0 - 2 + dw %%base %+ _w%3 - %%base + %rotate 1 + %endrep +%endmacro + +%macro HV_JMP_TABLE 5-* + %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3) + %xdefine %%base %1_%3 + %assign %%types %4 + %if %%types & 1 + %xdefine %1_%2_h_%3_table (%%h - %5) + %%h: + %rep %0 - 4 + dw %%prefix %+ .h_w%5 - %%base + %rotate 1 + %endrep + %rotate 4 + %endif + %if %%types & 2 + %xdefine %1_%2_v_%3_table (%%v - %5) + %%v: + %rep %0 - 4 + dw %%prefix %+ .v_w%5 - %%base + %rotate 1 + %endrep + %rotate 4 + %endif + %if %%types & 4 + %xdefine %1_%2_hv_%3_table (%%hv - %5) + %%hv: + %rep %0 - 4 + dw %%prefix %+ .hv_w%5 - %%base + %rotate 1 + %endrep + %endif 
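+; (each entry in the dw tables built here is a word-sized "label - base"
+;  delta rather than a full pointer; callers rebuild the jump target as in
+;  put_bilin below: movzx wd, word [table+wq*2] / add wq, base / jmp wq)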
+%endmacro + +%macro BIDIR_JMP_TABLE 2-* + %xdefine %1_%2_table (%%table - 2*%3) + %xdefine %%base %1_%2_table + %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) + %%table: + %rep %0 - 2 + dd %%prefix %+ .w%3 - %%base + %rotate 1 + %endrep +%endmacro + +%macro SCALED_JMP_TABLE 2-* + %xdefine %1_%2_table (%%table - %3) + %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2) +%%table: + %rep %0 - 2 + dw %%base %+ .w%3 - %%base + %rotate 1 + %endrep + %rotate 2 +%%dy_1024: + %xdefine %1_%2_dy1_table (%%dy_1024 - %3) + %rep %0 - 2 + dw %%base %+ .dy1_w%3 - %%base + %rotate 1 + %endrep + %rotate 2 +%%dy_2048: + %xdefine %1_%2_dy2_table (%%dy_2048 - %3) + %rep %0 - 2 + dw %%base %+ .dy2_w%3 - %%base + %rotate 1 + %endrep +%endmacro + +%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_8bpc_avx2.put) +%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_8bpc_avx2.prep) + +%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX + +BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128 +BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128 +SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128 +SCALED_JMP_TABLE prep_8tap_scaled, avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE avg, avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_444, avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE blend, avx2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_v, avx2, 2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_h, avx2, 2, 4, 8, 16, 32, 32, 32 + +SECTION .text + +INIT_XMM avx2 +cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy + movifnidn mxyd, r6m ; mx + lea r7, [put_avx2] + tzcnt wd, wm + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r7m ; my + test mxyd, mxyd + jnz .v +.put: + movzx wd, word [r7+wq*2+table_offset(put,)] + add wq, r7 + jmp wq +.put_w2: + movzx r6d, word [srcq+ssq*0] + movzx r7d, word [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r6w + mov [dstq+dsq*1], r7w + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w2 + RET +.put_w4: + mov r6d, [srcq+ssq*0] + mov r7d, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r6d + mov [dstq+dsq*1], r7d + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w4 + RET +.put_w8: + mov r6, [srcq+ssq*0] + mov r7, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r6 + mov [dstq+dsq*1], r7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w8 + RET +.put_w16: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w16 + RET +INIT_YMM avx2 +.put_w32: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w32 + RET +.put_w64: + movu m0, [srcq+ssq*0+32*0] + movu m1, [srcq+ssq*0+32*1] + movu m2, [srcq+ssq*1+32*0] + movu m3, [srcq+ssq*1+32*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0+32*0], m0 + mova [dstq+dsq*0+32*1], m1 + mova [dstq+dsq*1+32*0], m2 + mova [dstq+dsq*1+32*1], m3 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w64 + RET +.put_w128: + movu m0, 
[srcq+32*0] + movu m1, [srcq+32*1] + movu m2, [srcq+32*2] + movu m3, [srcq+32*3] + add srcq, ssq + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + mova [dstq+32*2], m2 + mova [dstq+32*3], m3 + add dstq, dsq + dec hd + jg .put_w128 + RET +.h: + ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 + ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 + imul mxyd, 255 + vbroadcasti128 m4, [bilin_h_shuf8] + add mxyd, 16 + movd xm5, mxyd + mov mxyd, r7m ; my + vpbroadcastw m5, xm5 + test mxyd, mxyd + jnz .hv + movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)] + vpbroadcastd m3, [pw_2048] + add wq, r7 + jmp wq +.h_w2: + movd xm0, [srcq+ssq*0] + pinsrd xm0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pshufb xm0, xm4 + pmaddubsw xm0, xm5 + pmulhrsw xm0, xm3 + packuswb xm0, xm0 + pextrw [dstq+dsq*0], xm0, 0 + pextrw [dstq+dsq*1], xm0, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2 + RET +.h_w4: + mova xm4, [bilin_h_shuf4] +.h_w4_loop: + movq xm0, [srcq+ssq*0] + movhps xm0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xm0, xm4 + pmaddubsw xm0, xm5 + pmulhrsw xm0, xm3 + packuswb xm0, xm0 + movd [dstq+dsq*0], xm0 + pextrd [dstq+dsq*1], xm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4_loop + RET +.h_w8: + movu xm0, [srcq+ssq*0] + movu xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xm0, xm4 + pshufb xm1, xm4 + pmaddubsw xm0, xm5 + pmaddubsw xm1, xm5 + pmulhrsw xm0, xm3 + pmulhrsw xm1, xm3 + packuswb xm0, xm1 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + movu xm0, [srcq+ssq*0+8*0] + vinserti128 m0, [srcq+ssq*1+8*0], 1 + movu xm1, [srcq+ssq*0+8*1] + vinserti128 m1, [srcq+ssq*1+8*1], 1 + lea srcq, [srcq+ssq*2] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w16 + RET +.h_w32: + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] + add srcq, ssq + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + mova [dstq], m0 + add dstq, dsq + dec hd + jg .h_w32 + RET +.h_w64: + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + movu m1, [srcq+8*4] + movu m2, [srcq+8*5] + add srcq, ssq + pshufb m1, m4 + pshufb m2, m4 + pmaddubsw m1, m5 + pmaddubsw m2, m5 + pmulhrsw m1, m3 + pmulhrsw m2, m3 + packuswb m1, m2 + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + add dstq, dsq + dec hd + jg .h_w64 + RET +.h_w128: + mov r6, -32*3 +.h_w128_loop: + movu m0, [srcq+r6+32*3+8*0] + movu m1, [srcq+r6+32*3+8*1] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + mova [dstq+r6+32*3], m0 + add r6, 32 + jle .h_w128_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w128 + RET +.v: + movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)] + imul mxyd, 255 + vpbroadcastd m5, [pw_2048] + add mxyd, 16 + add wq, r7 + movd xm4, mxyd + vpbroadcastw m4, xm4 + jmp wq +.v_w2: + movd xm0, [srcq+ssq*0] +.v_w2_loop: + pinsrw xm1, xm0, [srcq+ssq*1], 1 ; 0 1 + lea srcq, [srcq+ssq*2] + pinsrw xm0, xm1, [srcq+ssq*0], 0 ; 2 1 + pshuflw xm1, xm1, q2301 ; 1 0 + punpcklbw xm1, xm0 + pmaddubsw xm1, xm4 + pmulhrsw xm1, xm5 + packuswb xm1, xm1 + pextrw [dstq+dsq*0], xm1, 1 + pextrw [dstq+dsq*1], xm1, 0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg 
.v_w2_loop + RET +.v_w4: + movd xm0, [srcq+ssq*0] +.v_w4_loop: + vpbroadcastd xm2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd xm1, xm2, xm0, 0x01 ; 0 1 + vpbroadcastd xm0, [srcq+ssq*0] + vpblendd xm2, xm0, 0x02 ; 1 2 + punpcklbw xm1, xm2 + pmaddubsw xm1, xm4 + pmulhrsw xm1, xm5 + packuswb xm1, xm1 + movd [dstq+dsq*0], xm1 + pextrd [dstq+dsq*1], xm1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + movq xm0, [srcq+ssq*0] +.v_w8_loop: + movq xm2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklbw xm1, xm0, xm2 + movq xm0, [srcq+ssq*0] + punpcklbw xm2, xm0 + pmaddubsw xm1, xm4 + pmaddubsw xm2, xm4 + pmulhrsw xm1, xm5 + pmulhrsw xm2, xm5 + packuswb xm1, xm2 + movq [dstq+dsq*0], xm1 + movhps [dstq+dsq*1], xm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + RET +.v_w16: + movu xm0, [srcq+ssq*0] +.v_w16_loop: + vbroadcasti128 m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd m2, m3, m0, 0x0f ; 0 1 + vbroadcasti128 m0, [srcq+ssq*0] + vpblendd m3, m0, 0xf0 ; 1 2 + punpcklbw m1, m2, m3 + punpckhbw m2, m3 + pmaddubsw m1, m4 + pmaddubsw m2, m4 + pmulhrsw m1, m5 + pmulhrsw m2, m5 + packuswb m1, m2 + mova [dstq+dsq*0], xm1 + vextracti128 [dstq+dsq*1], m1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w16_loop + RET +.v_w32: +%macro PUT_BILIN_V_W32 0 + movu m0, [srcq+ssq*0] +%%loop: + movu m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklbw m1, m0, m3 + punpckhbw m2, m0, m3 + movu m0, [srcq+ssq*0] + pmaddubsw m1, m4 + pmaddubsw m2, m4 + pmulhrsw m1, m5 + pmulhrsw m2, m5 + packuswb m1, m2 + punpcklbw m2, m3, m0 + punpckhbw m3, m0 + pmaddubsw m2, m4 + pmaddubsw m3, m4 + pmulhrsw m2, m5 + pmulhrsw m3, m5 + packuswb m2, m3 + mova [dstq+dsq*0], m1 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg %%loop +%endmacro + PUT_BILIN_V_W32 + RET +.v_w64: + movu m0, [srcq+32*0] + movu m1, [srcq+32*1] +.v_w64_loop: + add srcq, ssq + movu m3, [srcq+32*0] + punpcklbw m2, m0, m3 + punpckhbw m0, m3 + pmaddubsw m2, m4 + pmaddubsw m0, m4 + pmulhrsw m2, m5 + pmulhrsw m0, m5 + packuswb m2, m0 + mova m0, m3 + movu m3, [srcq+32*1] + mova [dstq+32*0], m2 + punpcklbw m2, m1, m3 + punpckhbw m1, m3 + pmaddubsw m2, m4 + pmaddubsw m1, m4 + pmulhrsw m2, m5 + pmulhrsw m1, m5 + packuswb m2, m1 + mova m1, m3 + mova [dstq+32*1], m2 + add dstq, dsq + dec hd + jg .v_w64_loop + RET +.v_w128: + lea r6d, [hq+(3<<8)] + mov r4, srcq + mov r7, dstq +.v_w128_loop: + PUT_BILIN_V_W32 + add r4, 32 + add r7, 32 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<8 + jg .v_w128_loop + RET +.hv: + ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8 + ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4 + movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)] + WIN64_SPILL_XMM 8 + shl mxyd, 11 ; can't shift by 12 due to signed overflow + vpbroadcastd m7, [pw_15] + movd xm6, mxyd + add wq, r7 + paddb m5, m5 + vpbroadcastw m6, xm6 + jmp wq +.hv_w2: + vpbroadcastd xm0, [srcq+ssq*0] + pshufb xm0, xm4 + pmaddubsw xm0, xm5 +.hv_w2_loop: + movd xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pinsrd xm1, [srcq+ssq*0], 1 + pshufb xm1, xm4 + pmaddubsw xm1, xm5 ; 1 _ 2 _ + shufps xm2, xm0, xm1, q1032 ; 0 _ 1 _ + mova xm0, xm1 + psubw xm1, xm2 + pmulhw xm1, xm6 + pavgw xm2, xm7 + paddw xm1, xm2 + psrlw xm1, 4 + packuswb xm1, xm1 + pextrw [dstq+dsq*0], xm1, 0 + pextrw [dstq+dsq*1], xm1, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + mova xm4, [bilin_h_shuf4] + movddup xm0, [srcq+ssq*0] + pshufb xm0, xm4 + pmaddubsw xm0, xm5 +.hv_w4_loop: + movq xm1, 
[srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps xm1, [srcq+ssq*0] + pshufb xm1, xm4 + pmaddubsw xm1, xm5 ; 1 2 + shufps xm2, xm0, xm1, q1032 ; 0 1 + mova xm0, xm1 + psubw xm1, xm2 + pmulhw xm1, xm6 + pavgw xm2, xm7 + paddw xm1, xm2 + psrlw xm1, 4 + packuswb xm1, xm1 + movd [dstq+dsq*0], xm1 + pextrd [dstq+dsq*1], xm1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + vbroadcasti128 m0, [srcq+ssq*0] + pshufb m0, m4 + pmaddubsw m0, m5 +.hv_w8_loop: + movu xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti128 m1, [srcq+ssq*0], 1 + pshufb m1, m4 + pmaddubsw m1, m5 ; 1 2 + vperm2i128 m2, m0, m1, 0x21 ; 0 1 + mova m0, m1 + psubw m1, m2 + pmulhw m1, m6 + pavgw m2, m7 + paddw m1, m2 + psrlw m1, 4 + vextracti128 xm2, m1, 1 + packuswb xm1, xm2 + movq [dstq+dsq*0], xm1 + movhps [dstq+dsq*1], xm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w16: + movu m0, [srcq+ssq*0+8*0] + vinserti128 m0, [srcq+ssq*0+8*1], 1 + pshufb m0, m4 + pmaddubsw m0, m5 +.hv_w16_loop: + movu xm2, [srcq+ssq*1+8*0] + vinserti128 m2, [srcq+ssq*1+8*1], 1 + lea srcq, [srcq+ssq*2] + movu xm3, [srcq+ssq*0+8*0] + vinserti128 m3, [srcq+ssq*0+8*1], 1 + pshufb m2, m4 + pshufb m3, m4 + pmaddubsw m2, m5 + psubw m1, m2, m0 + pmulhw m1, m6 + pavgw m0, m7 + paddw m1, m0 + pmaddubsw m0, m3, m5 + psubw m3, m0, m2 + pmulhw m3, m6 + pavgw m2, m7 + paddw m3, m2 + psrlw m1, 4 + psrlw m3, 4 + packuswb m1, m3 + vpermq m1, m1, q3120 + mova [dstq+dsq*0], xm1 + vextracti128 [dstq+dsq*1], m1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w16_loop + RET +.hv_w128: + lea r6d, [hq+(3<<16)] + jmp .hv_w32_start +.hv_w64: + lea r6d, [hq+(1<<16)] +.hv_w32_start: + mov r4, srcq + mov r7, dstq +.hv_w32: +%if WIN64 + movaps r4m, xmm8 +%endif +.hv_w32_loop0: + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 +.hv_w32_loop: + add srcq, ssq + movu m2, [srcq+8*0] + movu m3, [srcq+8*1] + pshufb m2, m4 + pshufb m3, m4 + pmaddubsw m2, m5 + pmaddubsw m3, m5 + psubw m8, m2, m0 + pmulhw m8, m6 + pavgw m0, m7 + paddw m8, m0 + mova m0, m2 + psubw m2, m3, m1 + pmulhw m2, m6 + pavgw m1, m7 + paddw m2, m1 + mova m1, m3 + psrlw m8, 4 + psrlw m2, 4 + packuswb m8, m2 + mova [dstq], m8 + add dstq, dsq + dec hd + jg .hv_w32_loop + add r4, 32 + add r7, 32 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<16 + jg .hv_w32_loop0 +%if WIN64 + movaps xmm8, r4m +%endif + RET + +cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 + movifnidn mxyd, r5m ; mx + lea r6, [prep%+SUFFIX] + tzcnt wd, wm + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r6m ; my + test mxyd, mxyd + jnz .v +.prep: + movzx wd, word [r6+wq*2+table_offset(prep,)] + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +.prep_w4: + movd xm0, [srcq+strideq*0] + pinsrd xm0, [srcq+strideq*1], 1 + pinsrd xm0, [srcq+strideq*2], 2 + pinsrd xm0, [srcq+stride3q ], 3 + lea srcq, [srcq+strideq*4] + pmovzxbw m0, xm0 + psllw m0, 4 + mova [tmpq], m0 + add tmpq, 32 + sub hd, 4 + jg .prep_w4 + RET +.prep_w8: + movq xm0, [srcq+strideq*0] + movhps xm0, [srcq+strideq*1] + movq xm1, [srcq+strideq*2] + movhps xm1, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + pmovzxbw m0, xm0 + pmovzxbw m1, xm1 + psllw m0, 4 + psllw m1, 4 + mova [tmpq+32*0], m0 + mova [tmpq+32*1], m1 + add tmpq, 32*2 + sub hd, 4 + jg .prep_w8 + RET +.prep_w16: + pmovzxbw m0, [srcq+strideq*0] + pmovzxbw m1, [srcq+strideq*1] + pmovzxbw m2, [srcq+strideq*2] + pmovzxbw m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + psllw m0, 4 + 
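+; (prep stores unclipped int16 intermediates scaled by << 4, the 8bpc
+;  intermediate precision, so the compound functions can defer rounding;
+;  roughly, in C: tmp[x] = src[x] << 4)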
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 4
+ jg .prep_w16
+ RET
+.prep_w32:
+ pmovzxbw m0, [srcq+strideq*0+16*0]
+ pmovzxbw m1, [srcq+strideq*0+16*1]
+ pmovzxbw m2, [srcq+strideq*1+16*0]
+ pmovzxbw m3, [srcq+strideq*1+16*1]
+ lea srcq, [srcq+strideq*2]
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 2
+ jg .prep_w32
+ RET
+.prep_w64:
+ pmovzxbw m0, [srcq+16*0]
+ pmovzxbw m1, [srcq+16*1]
+ pmovzxbw m2, [srcq+16*2]
+ pmovzxbw m3, [srcq+16*3]
+ add srcq, strideq
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ dec hd
+ jg .prep_w64
+ RET
+.prep_w128:
+ pmovzxbw m0, [srcq+16*0]
+ pmovzxbw m1, [srcq+16*1]
+ pmovzxbw m2, [srcq+16*2]
+ pmovzxbw m3, [srcq+16*3]
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ pmovzxbw m0, [srcq+16*4]
+ pmovzxbw m1, [srcq+16*5]
+ pmovzxbw m2, [srcq+16*6]
+ pmovzxbw m3, [srcq+16*7]
+ add tmpq, 32*8
+ add srcq, strideq
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq-32*4], m0
+ mova [tmpq-32*3], m1
+ mova [tmpq-32*2], m2
+ mova [tmpq-32*1], m3
+ dec hd
+ jg .prep_w128
+ RET
+.h:
+ ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
+ ; = (16 - mx) * src[x] + mx * src[x + 1]
+ imul mxyd, 255
+ vbroadcasti128 m4, [bilin_h_shuf8]
+ add mxyd, 16
+ movd xm5, mxyd
+ mov mxyd, r6m ; my
+ vpbroadcastw m5, xm5
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.h_w4:
+ vbroadcasti128 m4, [bilin_h_shuf4]
+.h_w4_loop:
+ movq xm0, [srcq+strideq*0]
+ movhps xm0, [srcq+strideq*1]
+ movq xm1, [srcq+strideq*2]
+ movhps xm1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m0, xm1, 1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+.h_w8_loop:
+ movu xm0, [srcq+strideq*0]
+ vinserti128 m0, [srcq+strideq*1], 1
+ movu xm1, [srcq+strideq*2]
+ vinserti128 m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ add tmpq, 32*2
+ sub hd, 4
+ jg .h_w8_loop
+ RET
+.h_w16:
+.h_w16_loop:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ movu xm1, [srcq+strideq*1+8*0]
+ vinserti128 m1, [srcq+strideq*1+8*1], 1
+ movu xm2, [srcq+strideq*2+8*0]
+ vinserti128 m2, [srcq+strideq*2+8*1], 1
+ movu xm3, [srcq+stride3q +8*0]
+ vinserti128 m3, [srcq+stride3q +8*1], 1
+ lea srcq, [srcq+strideq*4]
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 4
+ jg .h_w16_loop
+ RET
+.h_w32:
+.h_w32_loop:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ movu xm1, [srcq+strideq*0+8*2]
+ vinserti128 m1, [srcq+strideq*0+8*3], 1
+ movu xm2, [srcq+strideq*1+8*0]
+ vinserti128 m2, [srcq+strideq*1+8*1], 1
+ movu xm3, [srcq+strideq*1+8*2]
+ vinserti128 m3, [srcq+strideq*1+8*3], 1
+ lea srcq, [srcq+strideq*2]
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 2
+ jg .h_w32_loop
+ RET
+.h_w64:
+ movu xm0, [srcq+8*0]
+ vinserti128 m0, [srcq+8*1], 1
+ movu xm1, [srcq+8*2]
+ vinserti128 m1, [srcq+8*3], 1
+ movu xm2, [srcq+8*4]
+ vinserti128 m2, [srcq+8*5], 1
+ movu xm3, [srcq+8*6]
+ vinserti128 m3, [srcq+8*7], 1
+ add srcq, strideq
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ movu xm0, [srcq+8*0]
+ vinserti128 m0, [srcq+8*1], 1
+ movu xm1, [srcq+8*2]
+ vinserti128 m1, [srcq+8*3], 1
+ movu xm2, [srcq+8*4]
+ vinserti128 m2, [srcq+8*5], 1
+ movu xm3, [srcq+8*6]
+ vinserti128 m3, [srcq+8*7], 1
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ movu xm0, [srcq+8* 8]
+ vinserti128 m0, [srcq+8* 9], 1
+ movu xm1, [srcq+8*10]
+ vinserti128 m1, [srcq+8*11], 1
+ movu xm2, [srcq+8*12]
+ vinserti128 m2, [srcq+8*13], 1
+ movu xm3, [srcq+8*14]
+ vinserti128 m3, [srcq+8*15], 1
+ add tmpq, 32*8
+ add srcq, strideq
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq-32*4], m0
+ mova [tmpq-32*3], m1
+ mova [tmpq-32*2], m2
+ mova [tmpq-32*1], m3
+ dec hd
+ jg .h_w128
+ RET
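The horizontal bilinear prep paths above all evaluate the 2-tap formula from the comment at .h with one pmaddubsw per 16 pixels: the coefficient word built from mxy*255+16 packs the byte pair ((16-mx), mx). A minimal scalar sketch of what one output sample computes (illustrative names, not dav1d's reference API; 8bpc, 0 < mx < 16):

    // 16*src[x] + mx*(src[x+1] - src[x]) == (16-mx)*src[x] + mx*src[x+1];
    // pmaddubsw produces exactly this pair of products per 16-bit lane.
    tmp[x] = (int16_t)((16 - mx) * src[x] + mx * src[x + 1]);

The result stays in the same <<4 intermediate scale as the no-filter psllw-4 paths, so all prep variants feed the averaging stage uniformly.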
+.v:
+ WIN64_SPILL_XMM 7
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
+ imul mxyd, 255
+ add mxyd, 16
+ add wq, r6
+ lea stride3q, [strideq*3]
+ movd xm6, mxyd
+ vpbroadcastw m6, xm6
+ jmp wq
+.v_w4:
+ movd xm0, [srcq+strideq*0]
+.v_w4_loop:
+ vpbroadcastd m1, [srcq+strideq*2]
+ vpbroadcastd xm2, [srcq+strideq*1]
+ vpbroadcastd m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m1, m0, 0x05 ; 0 2 2 2
+ vpbroadcastd m0, [srcq+strideq*0]
+ vpblendd m3, m2, 0x0f ; 1 1 3 3
+ vpblendd m2, m1, m0, 0xa0 ; 0 2 2 4
+ vpblendd m1, m3, 0xaa ; 0 1 2 3
+ vpblendd m2, m3, 0x55 ; 1 2 3 4
+ punpcklbw m1, m2
+ pmaddubsw m1, m6
+ mova [tmpq], m1
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xm0, [srcq+strideq*0]
+.v_w8_loop:
+ vpbroadcastq m1, [srcq+strideq*2]
+ vpbroadcastq m2, [srcq+strideq*1]
+ vpbroadcastq m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m1, m0, 0x03 ; 0 2 2 2
+ vpbroadcastq m0, [srcq+strideq*0]
+ vpblendd m2, m3, 0xcc ; 1 3 1 3
+ vpblendd m3, m2, m1, 0xf0 ; 1 3 2 2
+ vpblendd m2, m1, 0x0f ; 0 2 1 3
+ vpblendd m3, m0, 0xc0 ; 1 3 2 4
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ pmaddubsw m1, m6
+ pmaddubsw m2, m6
+ mova [tmpq+32*0], m1
+ mova [tmpq+32*1], m2
+ add tmpq, 32*2
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ vbroadcasti128 m0, [srcq+strideq*0]
+.v_w16_loop:
+ vbroadcasti128 m1, [srcq+strideq*1]
+ vbroadcasti128 m2, [srcq+strideq*2]
+ vbroadcasti128 m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ shufpd m4, m0, m2, 0x0c ; 0 2
+ vbroadcasti128 m0, [srcq+strideq*0]
+ shufpd m1, m3, 0x0c ; 1 3
+ shufpd m2, m0, 0x0c ; 2 4
+ punpcklbw m3, m4, m1
+ punpcklbw m5, m1, m2
+ punpckhbw m4, m1
+ punpckhbw m1, m2
+ pmaddubsw m3, m6
+ pmaddubsw m5, m6
+ pmaddubsw m4, m6
+ pmaddubsw m1, m6
+ mova [tmpq+32*0], m3
+ mova [tmpq+32*1], m5
+ mova [tmpq+32*2], m4
+ mova [tmpq+32*3], m1
+ add tmpq, 32*4
+ sub hd, 4
+ jg .v_w16_loop
+ RET
+.v_w32:
+ vpermq m0, [srcq+strideq*0], q3120
+.v_w32_loop:
+ vpermq m1, [srcq+strideq*1], q3120
+ vpermq m2, [srcq+strideq*2], q3120
+ vpermq m3, [srcq+stride3q ], q3120
+ lea srcq, [srcq+strideq*4]
+ punpcklbw m4, m0, m1
+ punpckhbw m5, m0, m1
+ vpermq m0, [srcq+strideq*0], q3120
+ pmaddubsw m4, m6
+ pmaddubsw m5, m6
+ mova [tmpq+32*0], m4
+ mova [tmpq+32*1], m5
+ punpcklbw m4, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m4, m6
+ pmaddubsw m1, m6
+ punpcklbw m5, m2, m3
+ punpckhbw m2, m3
+ pmaddubsw m5, m6
+ pmaddubsw m2, m6
+ mova [tmpq+32*2], m4
+ mova [tmpq+32*3], m1
+ add tmpq, 32*8
+ punpcklbw m1, m3, m0
+ punpckhbw m3, m0
+ pmaddubsw m1, m6
+ pmaddubsw m3, m6
+ mova [tmpq-32*4], m5
+ mova [tmpq-32*3], m2
+ mova [tmpq-32*2], m1
+ mova [tmpq-32*1], m3
+ sub hd, 4
+ jg .v_w32_loop
+ RET
+.v_w64:
+ vpermq m0, [srcq+strideq*0+32*0], q3120
+ vpermq m1, [srcq+strideq*0+32*1], q3120
+.v_w64_loop:
+ vpermq m2, [srcq+strideq*1+32*0], q3120
+ vpermq m3, [srcq+strideq*1+32*1], q3120
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m4, m0, m2
+ punpckhbw m0, m2
+ pmaddubsw m4, m6
+ pmaddubsw m0, m6
+ mova [tmpq+32*0], m4
+ mova [tmpq+32*1], m0
+ punpcklbw m4, m1, m3
+ punpckhbw m5, m1, m3
+ vpermq m0, [srcq+strideq*0+32*0], q3120
+ vpermq m1, [srcq+strideq*0+32*1], q3120
+ pmaddubsw m4, m6
+ pmaddubsw m5, m6
+ mova [tmpq+32*2], m4
+ mova [tmpq+32*3], m5
+ add tmpq, 32*8
+ punpcklbw m4, m2, m0
+ punpckhbw m2, m0
+ punpcklbw m5, m3, m1
+ punpckhbw m3, m1
+ pmaddubsw m4, m6
+ pmaddubsw m2, m6
+ pmaddubsw m5, m6
+ pmaddubsw m3, m6
+ mova [tmpq-32*4], m4
+ mova [tmpq-32*3], m2
+ mova [tmpq-32*2], m5
+ mova [tmpq-32*1], m3
+ sub hd, 2
+ jg .v_w64_loop
+ RET
+.v_w128:
+ lea r6d, [hq+(3<<8)]
+ mov r3, srcq
+ mov r5, tmpq
+.v_w128_loop0:
+ vpermq m0, [srcq+strideq*0], q3120
+.v_w128_loop:
+ vpermq m1, [srcq+strideq*1], q3120
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m2, m0, m1
+ punpckhbw m3, m0, m1
+ vpermq m0, [srcq+strideq*0], q3120
+ pmaddubsw m2, m6
+ pmaddubsw m3, m6
+ punpcklbw m4, m1, m0
+ punpckhbw m1, m0
+ pmaddubsw m4, m6
+ pmaddubsw m1, m6
+ mova [tmpq+32*0], m2
+ mova [tmpq+32*1], m3
+ mova [tmpq+32*8], m4
+ mova [tmpq+32*9], m1
+ add tmpq, 32*16
+ sub hd, 2
+ jg .v_w128_loop
+ add r3, 32
+ add r5, 64
+ movzx hd, r6b
+ mov srcq, r3
+ mov tmpq, r5
+ sub r6d, 1<<8
+ jg .v_w128_loop0
+ RET
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
+ ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 7
+ movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
+ shl mxyd, 11
+ movd xm6, mxyd
+ vpbroadcastw m6, xm6
+ add wq, r6
+ lea stride3q, [strideq*3]
+ jmp wq
+.hv_w4:
+ vbroadcasti128 m4, [bilin_h_shuf4]
+ vpbroadcastq m0, [srcq+strideq*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w4_loop:
+ movq xm1, [srcq+strideq*1]
+ movhps xm1, [srcq+strideq*2]
+ movq xm2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ movhps xm2, [srcq+strideq*0]
+ vinserti128 m1, xm2, 1
+ pshufb m1, m4
+ pmaddubsw m1, m5 ; 1 2 3 4
+ vpblendd m2, m1, m0, 0xc0
+ vpermq m2, m2, q2103 ; 0 1 2 3
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m6
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ vbroadcasti128 m0, [srcq+strideq*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w8_loop:
+ movu xm1, [srcq+strideq*1]
+ vinserti128 m1, [srcq+strideq*2], 1
+ movu xm2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m2, [srcq+strideq*0], 1
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5 ; 1 2
+ vperm2i128 m3, m0, m1, 0x21 ; 0 1
+ pmaddubsw m0, m2, m5 ; 3 4
+ vperm2i128 m2, m1, m0, 0x21 ; 2 3
+ psubw m1, m3
+ pmulhrsw m1, m6
+ paddw m1, m3
+ psubw m3, m0, m2
+ pmulhrsw m3, m6
+ paddw m3, m2
+ mova [tmpq+32*0], m1
+ mova [tmpq+32*1], m3
+ add tmpq, 32*2
+ sub hd, 4
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w16_loop:
+ movu xm1, [srcq+strideq*1+8*0]
+ vinserti128 m1, [srcq+strideq*1+8*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu xm2, [srcq+strideq*0+8*0]
+ vinserti128 m2, [srcq+strideq*0+8*1], 1
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5
+ psubw m3, m1, m0
+ pmulhrsw m3, m6
+ paddw m3, m0
+ pmaddubsw m0, m2, m5
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+32*0], m3
+ mova [tmpq+32*1], m2
+ add tmpq, 32*2
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+ movu xm0, [srcq+8*0]
+ vinserti128 m0, [srcq+8*1], 1
+ movu xm1, [srcq+8*2]
+ vinserti128 m1, [srcq+8*3], 1
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+.hv_w32_loop:
+ add srcq, strideq
+ movu xm2, [srcq+8*0]
+ vinserti128 m2, [srcq+8*1], 1
+ pshufb m2, m4
+ pmaddubsw m2, m5
+ psubw m3, m2, m0
+ pmulhrsw m3, m6
+ paddw m3, m0
+ mova m0, m2
+ movu xm2, [srcq+8*2]
+ vinserti128 m2, [srcq+8*3], 1
+ pshufb m2, m4
+ pmaddubsw m2, m5
+ mova [tmpq+32*0], m3
+ psubw m3, m2, m1
+ pmulhrsw m3, m6
+ paddw m3, m1
+ mova m1, m2
+ mova [tmpq+32*1], m3
+ add tmpq, 32*2
+ dec hd
+ jg .hv_w32_loop
+ RET
+.hv_w128:
+ lea r3d, [hq+(7<<8)]
+ mov r6d, 256
+ jmp .hv_w64_start
+.hv_w64:
+ lea r3d, [hq+(3<<8)]
+ mov r6d, 128
+.hv_w64_start:
+%if WIN64
+ PUSH r7
+%endif
+ mov r5, srcq
+ mov r7, tmpq
+.hv_w64_loop0:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w64_loop:
+ movu xm1, [srcq+strideq*1+8*0]
+ vinserti128 m1, [srcq+strideq*1+8*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu xm2, [srcq+strideq*0+8*0]
+ vinserti128 m2, [srcq+strideq*0+8*1], 1
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5
+ psubw m3, m1, m0
+ pmulhrsw m3, m6
+ paddw m3, m0
+ pmaddubsw m0, m2, m5
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+r6*0], m3
+ mova [tmpq+r6*1], m2
+ lea tmpq, [tmpq+r6*2]
+ sub hd, 2
+ jg .hv_w64_loop
+ add r5, 16
+ add r7, 32
+ movzx hd, r3b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r3d, 1<<8
+ jg .hv_w64_loop0
+%if WIN64
+ POP r7
+%endif
+ RET
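The bilinear hv paths implement the second-pass identity from the comment at .hv with a single pmulhrsw: since mxy was shifted left by 11, pmulhrsw computes (d*(my<<11) + (1<<14)) >> 15 == (d*my + 8) >> 4. A scalar sketch of one output sample, under the assumption that a[] and b[] are the 16-bit horizontal intermediates of two adjacent rows (illustrative names only):

    // 16*a + my*(b - a) + 8 >> 4  ==  a + ((my*(b - a) + 8) >> 4)
    int d = b[x] - a[x];
    tmp[x] = (int16_t)(a[x] + ((my * d + 8) >> 4)); // psubw/pmulhrsw/paddw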
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
+
+%macro FN 4 ; fn, type, type_h, type_v
+cglobal %1_%2_8bpc
+ mov t0d, FILTER_%3
+%ifidn %3, %4
+ mov t1d, t0d
+%else
+ mov t1d, FILTER_%4
+%endif
+%ifnidn %2, regular ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX)
+%endif
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 4, 5
+%else
+DECLARE_REG_TMP 7, 8
+%endif
+
+%define PUT_8TAP_FN FN put_8tap,
+PUT_8TAP_FN sharp, SHARP, SHARP
+PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN regular, REGULAR, REGULAR
+
+cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r8, [put_avx2]
+ movsxd wq, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r8+wq*2+table_offset(put,)]
+ add wq, r8
+ lea r6, [ssq*3]
+ lea r7, [dsq*3]
+%if WIN64
+ pop r8
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m5, [pw_34] ; 2 + (8 << 2)
+ WIN64_SPILL_XMM 11
+ cmp wd, 4
+ jl .h_w2
+ vbroadcasti128 m6, [subpel_h_shufA]
+ je .h_w4
+ tzcnt wd, wd
+ vbroadcasti128 m7, [subpel_h_shufB]
+ vbroadcasti128 m8, [subpel_h_shufC]
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)]
+ vpbroadcastd m9, [r8+mxq*8+subpel_filters-put_avx2+0]
+ vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+4]
+ add wq, r8
+ jmp wq
+.h_w2:
+ movzx mxd, mxb
+ dec srcq
+ mova xm4, [subpel_h_shuf4]
+ vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2]
+.h_w2_loop:
+ movq xm0, [srcq+ssq*0]
+ movhps xm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm4
+ pmaddubsw xm0, xm3
+ phaddw xm0, xm0
+ paddw xm0, xm5
+ psraw xm0, 6
+ packuswb xm0, xm0
+ pextrw [dstq+dsq*0], xm0, 0
+ pextrw [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2]
+.h_w4_loop:
+ movq xm0, [srcq+ssq*0]
+ movq xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm6
+ pshufb xm1, xm6
+ pmaddubsw xm0, xm3
+ pmaddubsw xm1, xm3
+ phaddw xm0, xm1
+ paddw xm0, xm5
+ psraw xm0, 6
+ packuswb xm0, xm0
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h_w8:
+%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
+ pshufb m%2, m%1, m7
+ pshufb m%3, m%1, m8
+ pshufb m%1, m6
+ pmaddubsw m%4, m%2, m9
+ pmaddubsw m%2, m10
+ pmaddubsw m%3, m10
+ pmaddubsw m%1, m9
+ paddw m%3, m%4
+ paddw m%1, m%2
+ phaddw m%1, m%3
+ paddw m%1, m5
+ psraw m%1, 6
+%endmacro
+ movu xm0, [srcq+ssq*0]
+ vinserti128 m0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 0, 1, 2, 3
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu xm0, [srcq+ssq*0+8*0]
+ vinserti128 m0, [srcq+ssq*1+8*0], 1
+ movu xm1, [srcq+ssq*0+8*1]
+ vinserti128 m1, [srcq+ssq*1+8*1], 1
+ PUT_8TAP_H 0, 2, 3, 4
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 1, 2, 3, 4
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ xor r6d, r6d
+ jmp .h_start
+.h_w64:
+ mov r6, -32*1
+ jmp .h_start
+.h_w128:
+ mov r6, -32*3
+.h_start:
+ sub srcq, r6
+ sub dstq, r6
+ mov r4, r6
+.h_loop:
+ movu m0, [srcq+r6+8*0]
+ movu m1, [srcq+r6+8*1]
+ PUT_8TAP_H 0, 2, 3, 4
+ PUT_8TAP_H 1, 2, 3, 4
+ packuswb m0, m1
+ mova [dstq+r6], m0
+ add r6, 32
+ jle .h_loop
+ add srcq, ssq
+ add dstq, dsq
+ mov r6, r4
+ dec hd
+ jg .h_loop
+ RET
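PUT_8TAP_H above splits the 8 taps across three shuffles so two pmaddubsw chains cover all neighbours, then applies the merged rounding constant pw_34 (2 + (8 << 2)) and a 6-bit shift before packuswb saturates to 8 bits. A hedged scalar sketch of one output pixel, assuming f[] is the selected 8-tap subpel filter row (names illustrative):

    int sum = 34;                 // pw_34, the combined rounding bias
    for (int k = 0; k < 8; k++)
        sum += f[k] * src[x + k - 3];   // srcq was pre-decremented by 3
    sum >>= 6;                    // psraw 6
    dst[x] = sum < 0 ? 0 : sum > 255 ? 255 : sum; // packuswb saturation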
+.v:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 16
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ tzcnt r6d, wd
+ movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)]
+ vpbroadcastd m7, [pw_512]
+ lea myq, [r8+myq*8+subpel_filters-put_avx2]
+ vpbroadcastw m8, [myq+0]
+ vpbroadcastw m9, [myq+2]
+ vpbroadcastw m10, [myq+4]
+ vpbroadcastw m11, [myq+6]
+ add r6, r8
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+ jmp r6
+.v_w2:
+ movd xm2, [srcq+ssq*0]
+ pinsrw xm2, [srcq+ssq*1], 2
+ pinsrw xm2, [srcq+ssq*2], 4
+ add srcq, ss3q
+ pinsrw xm2, [srcq+ssq*0], 6 ; 0 1 2 3
+ movd xm3, [srcq+ssq*1]
+ vpbroadcastd xm1, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastd xm0, [srcq+ssq*0]
+ vpblendd xm3, xm1, 0x02 ; 4 5
+ vpblendd xm1, xm0, 0x02 ; 5 6
+ palignr xm4, xm3, xm2, 4 ; 1 2 3 4
+ punpcklbw xm3, xm1 ; 45 56
+ punpcklbw xm1, xm2, xm4 ; 01 12
+ punpckhbw xm2, xm4 ; 23 34
+.v_w2_loop:
+ pmaddubsw xm5, xm1, xm8 ; a0 b0
+ mova xm1, xm2
+ pmaddubsw xm2, xm9 ; a1 b1
+ paddw xm5, xm2
+ mova xm2, xm3
+ pmaddubsw xm3, xm10 ; a2 b2
+ paddw xm5, xm3
+ vpbroadcastd xm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xm3, xm0, xm4, 0x02 ; 6 7
+ vpbroadcastd xm0, [srcq+ssq*0]
+ vpblendd xm4, xm0, 0x02 ; 7 8
+ punpcklbw xm3, xm4 ; 67 78
+ pmaddubsw xm4, xm3, xm11 ; a3 b3
+ paddw xm5, xm4
+ pmulhrsw xm5, xm7
+ packuswb xm5, xm5
+ pextrw [dstq+dsq*0], xm5, 0
+ pextrw [dstq+dsq*1], xm5, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movd xm2, [srcq+ssq*0]
+ pinsrd xm2, [srcq+ssq*1], 1
+ pinsrd xm2, [srcq+ssq*2], 2
+ add srcq, ss3q
+ pinsrd xm2, [srcq+ssq*0], 3 ; 0 1 2 3
+ movd xm3, [srcq+ssq*1]
+ vpbroadcastd xm1, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastd xm0, [srcq+ssq*0]
+ vpblendd xm3, xm1, 0x02 ; 4 5
+ vpblendd xm1, xm0, 0x02 ; 5 6
+ palignr xm4, xm3, xm2, 4 ; 1 2 3 4
+ punpcklbw xm3, xm1 ; 45 56
+ punpcklbw xm1, xm2, xm4 ; 01 12
+ punpckhbw xm2, xm4 ; 23 34
+.v_w4_loop:
+ pmaddubsw xm5, xm1, xm8 ; a0 b0
+ mova xm1, xm2
+ pmaddubsw xm2, xm9 ; a1 b1
+ paddw xm5, xm2
+ mova xm2, xm3
+ pmaddubsw xm3, xm10 ; a2 b2
+ paddw xm5, xm3
+ vpbroadcastd xm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xm3, xm0, xm4, 0x02 ; 6 7
+ vpbroadcastd xm0, [srcq+ssq*0]
+ vpblendd xm4, xm0, 0x02 ; 7 8
+ punpcklbw xm3, xm4 ; 67 78
+ pmaddubsw xm4, xm3, xm11 ; a3 b3
+ paddw xm5, xm4
+ pmulhrsw xm5, xm7
+ packuswb xm5, xm5
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xm1, [srcq+ssq*0]
+ vpbroadcastq m4, [srcq+ssq*1]
+ vpbroadcastq m2, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq m5, [srcq+ssq*0]
+ vpbroadcastq m3, [srcq+ssq*1]
+ vpbroadcastq m6, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq m0, [srcq+ssq*0]
+ vpblendd m1, m4, 0x30
+ vpblendd m4, m2, 0x30
+ punpcklbw m1, m4 ; 01 12
+ vpblendd m2, m5, 0x30
+ vpblendd m5, m3, 0x30
+ punpcklbw m2, m5 ; 23 34
+ vpblendd m3, m6, 0x30
+ vpblendd m6, m0, 0x30
+ punpcklbw m3, m6 ; 45 56
+.v_w8_loop:
+ vpbroadcastq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddubsw m5, m1, m8 ; a0 b0
+ mova m1, m2
+ pmaddubsw m2, m9 ; a1 b1
+ paddw m5, m2
+ mova m2, m3
+ pmaddubsw m3, m10 ; a2 b2
+ paddw m5, m3
+ vpblendd m3, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+ssq*0]
+ vpblendd m4, m0, 0x30
+ punpcklbw m3, m4 ; 67 78
+ pmaddubsw m4, m3, m11 ; a3 b3
+ paddw m5, m4
+ pmulhrsw m5, m7
+ vextracti128 xm4, m5, 1
+ packuswb xm5, xm4
+ movq [dstq+dsq*0], xm5
+ movhps [dstq+dsq*1], xm5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+.v_w32:
+.v_w64:
+.v_w128:
+ lea r6d, [wq*8-128]
+ mov r4, srcq
+ mov r7, dstq
+ lea r6d, [hq+r6*2]
+.v_w16_loop0:
+ vbroadcasti128 m4, [srcq+ssq*0]
+ vbroadcasti128 m5, [srcq+ssq*1]
+ vbroadcasti128 m6, [srcq+ssq*2]
+ add srcq, ss3q
+ vbroadcasti128 m0, [srcq+ssq*0]
+ vbroadcasti128 m1, [srcq+ssq*1]
+ vbroadcasti128 m2, [srcq+ssq*2]
+ add srcq, ss3q
+ vbroadcasti128 m3, [srcq+ssq*0]
+ shufpd m4, m0, 0x0c
+ shufpd m5, m1, 0x0c
+ punpcklbw m1, m4, m5 ; 01
+ punpckhbw m4, m5 ; 34
+ shufpd m6, m2, 0x0c
+ punpcklbw m2, m5, m6 ; 12
+ punpckhbw m5, m6 ; 45
+ shufpd m0, m3, 0x0c
+ punpcklbw m3, m6, m0 ; 23
+ punpckhbw m6, m0 ; 56
+.v_w16_loop:
+ vbroadcasti128 m12, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti128 m13, [srcq+ssq*0]
+ pmaddubsw m14, m1, m8 ; a0
+ pmaddubsw m15, m2, m8 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddubsw m3, m9 ; a1
+ pmaddubsw m4, m9 ; b1
+ paddw m14, m3
+ paddw m15, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddubsw m5, m10 ; a2
+ pmaddubsw m6, m10 ; b2
+ paddw m14, m5
+ paddw m15, m6
+ shufpd m6, m0, m12, 0x0d
+ shufpd m0, m12, m13, 0x0c
+ punpcklbw m5, m6, m0 ; 67
+ punpckhbw m6, m0 ; 78
+ pmaddubsw m12, m5, m11 ; a3
+ pmaddubsw m13, m6, m11 ; b3
+ paddw m14, m12
+ paddw m15, m13
+ pmulhrsw m14, m7
+ pmulhrsw m15, m7
+ packuswb m14, m15
+ vpermq m14, m14, q3120
+ mova [dstq+dsq*0], xm14
+ vextracti128 [dstq+dsq*1], m14, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ add r4, 16
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .v_w16_loop0
+ RET
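The vertical-only put paths keep the last eight rows as interleaved byte pairs (the "01 12", "23 34" comments) and accumulate four pmaddubsw products; the final pmulhrsw with pw_512 is a rounded 6-bit shift, since (sum*512 + (1<<14)) >> 15 == (sum + 32) >> 6. A scalar sketch of one column sample, with f[] the 8-tap vertical filter (illustrative names):

    int sum = 0;
    for (int k = 0; k < 8; k++)
        sum += f[k] * src[(y + k - 3) * stride + x]; // srcq starts at -3 rows
    sum = (sum + 32) >> 6;                           // pmulhrsw with pw_512
    dst[x] = sum < 0 ? 0 : sum > 255 ? 255 : sum;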
+.hv:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 16
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m7, [r8+mxq*8+subpel_filters-put_avx2+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2]
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ vpbroadcastd m8, [pw_8192]
+ vpbroadcastd m9, [pd_512]
+ pshufd m10, m0, q0000
+ pshufd m11, m0, q1111
+ pshufd m12, m0, q2222
+ pshufd m13, m0, q3333
+ cmp wd, 4
+ je .hv_w4
+ vbroadcasti128 m6, [subpel_h_shuf4]
+ movq xm2, [srcq+ssq*0]
+ movhps xm2, [srcq+ssq*1]
+ movq xm0, [srcq+ssq*2]
+ add srcq, ss3q
+ movhps xm0, [srcq+ssq*0]
+ vpbroadcastq m3, [srcq+ssq*1]
+ vpbroadcastq m4, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq m1, [srcq+ssq*0]
+ vpblendd m2, m3, 0x30
+ vpblendd m0, m1, 0x30
+ vpblendd m2, m4, 0xc0
+ pshufb m2, m6
+ pshufb m0, m6
+ pmaddubsw m2, m7
+ pmaddubsw m0, m7
+ phaddw m2, m0
+ pmulhrsw m2, m8
+ vextracti128 xm3, m2, 1
+ palignr xm4, xm3, xm2, 4
+ punpcklwd xm1, xm2, xm4 ; 01 12
+ punpckhwd xm2, xm4 ; 23 34
+ pshufd xm0, xm3, q2121
+ punpcklwd xm3, xm0 ; 45 56
+.hv_w2_loop:
+ movq xm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xm4, [srcq+ssq*0]
+ pshufb xm4, xm6
+ pmaddubsw xm4, xm7
+ pmaddwd xm5, xm1, xm10 ; a0 b0
+ mova xm1, xm2
+ pmaddwd xm2, xm11 ; a1 b1
+ paddd xm5, xm2
+ mova xm2, xm3
+ pmaddwd xm3, xm12 ; a2 b2
+ phaddw xm4, xm4
+ pmulhrsw xm4, xm8
+ paddd xm5, xm3
+ palignr xm3, xm4, xm0, 12
+ mova xm0, xm4
+ punpcklwd xm3, xm0 ; 67 78
+ pmaddwd xm4, xm3, xm13 ; a3 b3
+ paddd xm5, xm9
+ paddd xm5, xm4
+ psrad xm5, 10
+ packssdw xm5, xm5
+ packuswb xm5, xm5
+ pextrw [dstq+dsq*0], xm5, 0
+ pextrw [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ mova m6, [subpel_h_shuf4]
+ vpbroadcastq m2, [srcq+ssq*0]
+ vpbroadcastq m4, [srcq+ssq*1]
+ vpbroadcastq m0, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq m5, [srcq+ssq*0]
+ vpbroadcastq m3, [srcq+ssq*1]
+ vpblendd m2, m4, 0xcc ; 0 1
+ vpbroadcastq m4, [srcq+ssq*2]
+ add srcq, ss3q
+ vpbroadcastq m1, [srcq+ssq*0]
+ vpblendd m0, m5, 0xcc ; 2 3
+ vpblendd m3, m4, 0xcc ; 4 5
+ pshufb m2, m6
+ pshufb m0, m6
+ pshufb m3, m6
+ pshufb m1, m6
+ pmaddubsw m2, m7
+ pmaddubsw m0, m7
+ pmaddubsw m3, m7
+ pmaddubsw m1, m7
+ phaddw m2, m0
+ phaddw m3, m1
+ pmulhrsw m2, m8
+ pmulhrsw m3, m8
+ palignr m4, m3, m2, 4
+ punpcklwd m1, m2, m4 ; 01 12
+ punpckhwd m2, m4 ; 23 34
+ pshufd m0, m3, q2121
+ punpcklwd m3, m0 ; 45 56
+.hv_w4_loop:
+ vpbroadcastq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m5, m1, m10 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m11 ; a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m12 ; a2 b2
+ paddd m5, m3
+ vpbroadcastq m3, [srcq+ssq*0]
+ vpblendd m4, m3, 0xcc ; 7 8
+ pshufb m4, m6
+ pmaddubsw m4, m7
+ phaddw m4, m4
+ pmulhrsw m4, m8
+ palignr m3, m4, m0, 12
+ mova m0, m4
+ punpcklwd m3, m0 ; 67 78
+ pmaddwd m4, m3, m13 ; a3 b3
+ paddd m5, m9
+ paddd m5, m4
+ psrad m5, 10
+ vextracti128 xm4, m5, 1
+ packssdw xm5, xm4
+ packuswb xm5, xm5
+ pshuflw xm5, xm5, q3120
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ shr mxd, 16
+ sub srcq, 3
+ vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+0]
+ vpbroadcastd m11, [r8+mxq*8+subpel_filters-put_avx2+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2]
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ lea r6d, [wq*8-64]
+ mov r4, srcq
+ mov r7, dstq
+ lea r6d, [hq+r6*4]
+.hv_w8_loop0:
+ vbroadcasti128 m7, [subpel_h_shufA]
+ movu xm4, [srcq+ssq*0]
+ vbroadcasti128 m8, [subpel_h_shufB]
+ movu xm5, [srcq+ssq*1]
+ vbroadcasti128 m9, [subpel_h_shufC]
+ movu xm6, [srcq+ssq*2]
+ add srcq, ss3q
+ vbroadcasti128 m0, [srcq+ssq*0]
+ vpblendd m4, m0, 0xf0 ; 0 3
+ vinserti128 m5, [srcq+ssq*1], 1 ; 1 4
+ vinserti128 m6, [srcq+ssq*2], 1 ; 2 5
+ add srcq, ss3q
+ vinserti128 m0, [srcq+ssq*0], 1 ; 3 6
+%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
+ pshufb %3, %1, %6
+ pshufb %4, %1, %7
+ pshufb %1, %5
+ pmaddubsw %2, %3, m10
+ pmaddubsw %4, m11
+ pmaddubsw %3, m11
+ pmaddubsw %1, m10
+ paddw %2, %4
+ paddw %1, %3
+ phaddw %1, %2
+%endmacro
+ HV_H_W8 m4, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m5, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m6, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m0, m1, m2, m3, m7, m8, m9
+ vpbroadcastd m7, [pw_8192]
+ vpermq m4, m4, q3120
+ vpermq m5, m5, q3120
+ vpermq m6, m6, q3120
+ pmulhrsw m0, m7
+ pmulhrsw m4, m7
+ pmulhrsw m5, m7
+ pmulhrsw m6, m7
+ vpermq m7, m0, q3120
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ punpcklwd m3, m6, m7 ; 23
+ punpckhwd m6, m7 ; 56
+.hv_w8_loop:
+ vextracti128 r6m, m0, 1 ; not enough registers
+ movu xm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti128 m0, [srcq+ssq*0], 1 ; 7 8
+ pmaddwd m8, m1, m12 ; a0
+ pmaddwd m9, m2, m12 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m13 ; a1
+ pmaddwd m4, m13 ; b1
+ paddd m8, m3
+ paddd m9, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m14 ; a2
+ pmaddwd m6, m14 ; b2
+ paddd m8, m5
+ paddd m9, m6
+ vbroadcasti128 m6, [subpel_h_shufB]
+ vbroadcasti128 m7, [subpel_h_shufC]
+ vbroadcasti128 m5, [subpel_h_shufA]
+ HV_H_W8 m0, m5, m6, m7, m5, m6, m7
+ vpbroadcastd m5, [pw_8192]
+ vpbroadcastd m7, [pd_512]
+ vbroadcasti128 m6, r6m
+ pmulhrsw m0, m5
+ paddd m8, m7
+ paddd m9, m7
+ vpermq m7, m0, q3120 ; 7 8
+ shufpd m6, m6, m7, 0x04 ; 6 7
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, m15 ; a3
+ paddd m8, m7
+ pmaddwd m7, m6, m15 ; b3
+ paddd m7, m9
+ psrad m8, 10
+ psrad m7, 10
+ packssdw m8, m7
+ vextracti128 xm7, m8, 1
+ packuswb xm8, xm7
+ pshufd xm7, xm8, q3120
+ movq [dstq+dsq*0], xm7
+ movhps [dstq+dsq*1], xm7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ add r4, 8
+ add r7, 8
+ movzx hd, r6b
+ mov srcq, r4
+ mov dstq, r7
+ sub r6d, 1<<8
+ jg .hv_w8_loop0
+ RET
+
+%macro PREP_8TAP_H 0
+ pshufb m1, m0, m5
+ pshufb m2, m0, m6
+ pshufb m3, m0, m7
+ pmaddubsw m1, m8
+ pmaddubsw m0, m2, m8
+ pmaddubsw m2, m9
+ pmaddubsw m3, m9
+ paddw m1, m2
+ paddw m0, m3
+ phaddw m0, m1, m0
+ pmulhrsw m0, m4
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+%define PREP_8TAP_FN FN prep_8tap,
+PREP_8TAP_FN sharp, SHARP, SHARP
+PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN regular, REGULAR, REGULAR
+
+cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r7, [prep%+SUFFIX]
+ movsxd wq, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r7+wq*2+table_offset(prep,)]
+ add wq, r7
+ lea r6, [strideq*3]
+%if WIN64
+ pop r7
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m4, [pw_8192]
+ vbroadcasti128 m5, [subpel_h_shufA]
+ WIN64_SPILL_XMM 10
+ cmp wd, 4
+ je .h_w4
+ tzcnt wd, wd
+ vbroadcasti128 m6, [subpel_h_shufB]
+ vbroadcasti128 m7, [subpel_h_shufC]
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
+ vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0]
+ vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4]
+ add wq, r7
+ jmp wq
+.h_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m6, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
+ lea stride3q, [strideq*3]
+.h_w4_loop:
+ movq xm0, [srcq+strideq*0]
+ vpbroadcastq m2, [srcq+strideq*2]
+ movq xm1, [srcq+strideq*1]
+ vpblendd m0, m2, 0xf0
+ vpbroadcastq m2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m1, m2, 0xf0
+ pshufb m0, m5
+ pshufb m1, m5
+ pmaddubsw m0, m6
+ pmaddubsw m1, m6
+ phaddw m0, m1
+ pmulhrsw m0, m4
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+ movu xm0, [srcq+strideq*0]
+ vinserti128 m0, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_H
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ PREP_8TAP_H
+ mova [tmpq+32*0], m0
+ movu xm0, [srcq+strideq*1+8*0]
+ vinserti128 m0, [srcq+strideq*1+8*1], 1
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_H
+ mova [tmpq+32*1], m0
+ add tmpq, 32*2
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ xor r6d, r6d
+ jmp .h_start
+.h_w64:
+ mov r6, -32*1
+ jmp .h_start
+.h_w128:
+ mov r6, -32*3
+.h_start:
+ sub srcq, r6
+ mov r5, r6
+.h_loop:
+ movu xm0, [srcq+r6+8*0]
+ vinserti128 m0, [srcq+r6+8*1], 1
+ PREP_8TAP_H
+ mova [tmpq+32*0], m0
+ movu xm0, [srcq+r6+8*2]
+ vinserti128 m0, [srcq+r6+8*3], 1
+ PREP_8TAP_H
+ mova [tmpq+32*1], m0
+ add tmpq, 32*2
+ add r6, 32
+ jle .h_loop
+ add srcq, strideq
+ mov r6, r5
+ dec hd
+ jg .h_loop
+ RET
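PREP_8TAP_H differs from the put path only in the final scale: pmulhrsw with pw_8192 is (sum*8192 + (1<<14)) >> 15 == (sum + 2) >> 2, and the result is stored as an unclipped signed 16-bit intermediate for the later averaging/compound stage rather than packed to pixels. A scalar sketch under those assumptions (illustrative names):

    int sum = 0;
    for (int k = 0; k < 8; k++)
        sum += f[k] * src[x + k - 3];
    tmp[x] = (int16_t)((sum + 2) >> 2); // pmulhrsw with pw_8192, no clamp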
+.v:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 16
+ movzx mxd, myb ; Select 4-tap/8-tap filter multipliers.
+ shr myd, 16 ; Note that the code is 8-tap only, having
+ cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4
+ cmove myd, mxd ; had a negligible effect on performance.
+ ; TODO: Would a 6-tap code path be worth it?
+ lea myq, [r7+myq*8+subpel_filters-prep%+SUFFIX]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ vpbroadcastd m7, [pw_8192]
+ vpbroadcastw m8, [myq+0]
+ vpbroadcastw m9, [myq+2]
+ vpbroadcastw m10, [myq+4]
+ vpbroadcastw m11, [myq+6]
+ cmp wd, 8
+ jg .v_w16
+ je .v_w8
+.v_w4:
+ movd xm0, [srcq+strideq*0]
+ vpbroadcastd m1, [srcq+strideq*2]
+ vpbroadcastd xm2, [srcq+strideq*1]
+ add srcq, stride3q
+ vpbroadcastd m3, [srcq+strideq*0]
+ vpblendd m1, m0, 0x01 ; 0 2 2 _ 2 _ _ _
+ vpblendd m3, m2, 0x03 ; 1 1 3 3 3 3 _ _
+ vpbroadcastd m0, [srcq+strideq*1]
+ vpbroadcastd m2, [srcq+strideq*2]
+ vpblendd m1, m0, 0x68 ; 0 2 2 4 2 4 4 _
+ vpbroadcastd m0, [srcq+stride3q ]
+ vbroadcasti128 m5, [deint_shuf4]
+ vpblendd m3, m2, 0xc0 ; 1 1 3 3 3 3 5 5
+ vpblendd m2, m3, m1, 0x55 ; 0 1 2 3 2 3 4 5
+ vpblendd m3, m1, 0xaa ; 1 2 3 4 3 4 5 _
+ punpcklbw m1, m2, m3 ; 01 12 23 34
+ vpblendd m3, m0, 0x80 ; 1 2 3 4 3 4 5 6
+ punpckhbw m2, m3 ; 23 34 45 56
+.v_w4_loop:
+ lea srcq, [srcq+strideq*4]
+ pinsrd xm0, [srcq+strideq*0], 1
+ vpbroadcastd m3, [srcq+strideq*1]
+ vpbroadcastd m4, [srcq+strideq*2]
+ vpblendd m3, m0, 0x03 ; 6 7 8 _ 8 _ _ _
+ vpbroadcastd m0, [srcq+stride3q ]
+ vpblendd m3, m4, 0x20 ; 6 7 8 _ 8 9 _ _
+ vpblendd m3, m0, 0x40 ; 6 7 8 _ 8 9 a _
+ pshufb m3, m5 ; 67 78 89 9a
+ pmaddubsw m4, m1, m8
+ vperm2i128 m1, m2, m3, 0x21 ; 45 56 67 78
+ pmaddubsw m2, m9
+ paddw m4, m2
+ mova m2, m3
+ pmaddubsw m3, m11
+ paddw m3, m4
+ pmaddubsw m4, m1, m10
+ paddw m3, m4
+ pmulhrsw m3, m7
+ mova [tmpq], m3
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xm1, [srcq+strideq*0]
+ vpbroadcastq m4, [srcq+strideq*1]
+ vpbroadcastq m2, [srcq+strideq*2]
+ vpbroadcastq m5, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m3, [srcq+strideq*0]
+ vpbroadcastq m6, [srcq+strideq*1]
+ vpbroadcastq m0, [srcq+strideq*2]
+ vpblendd m1, m4, 0x30
+ vpblendd m4, m2, 0x30
+ punpcklbw m1, m4 ; 01 12
+ vpblendd m2, m5, 0x30
+ vpblendd m5, m3, 0x30
+ punpcklbw m2, m5 ; 23 34
+ vpblendd m3, m6, 0x30
+ vpblendd m6, m0, 0x30
+ punpcklbw m3, m6 ; 45 56
+.v_w8_loop:
+ vpbroadcastq m4, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pmaddubsw m5, m2, m9 ; a1
+ pmaddubsw m6, m2, m8 ; b0
+ vpblendd m2, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+strideq*0]
+ vpblendd m4, m0, 0x30
+ punpcklbw m2, m4 ; 67 78
+ pmaddubsw m1, m8 ; a0
+ pmaddubsw m4, m3, m9 ; b1
+ paddw m5, m1
+ mova m1, m3
+ pmaddubsw m3, m10 ; a2
+ paddw m6, m4
+ paddw m5, m3
+ vpbroadcastq m4, [srcq+strideq*1]
+ vpblendd m3, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+strideq*2]
+ vpblendd m4, m0, 0x30
+ punpcklbw m3, m4 ; 89 9a
+ pmaddubsw m4, m2, m11 ; a3
+ paddw m5, m4
+ pmaddubsw m4, m2, m10 ; b2
+ paddw m6, m4
+ pmaddubsw m4, m3, m11 ; b3
+ paddw m6, m4
+ pmulhrsw m5, m7
+ pmulhrsw m6, m7
+ mova [tmpq+32*0], m5
+ mova [tmpq+32*1], m6
+ add tmpq, 32*2
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ add wd, wd
+ mov r5, srcq
+ mov r7, tmpq
+ lea r6d, [hq+wq*8-256]
+.v_w16_loop0:
+ vbroadcasti128 m4, [srcq+strideq*0]
+ vbroadcasti128 m5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m0, [srcq+strideq*1]
+ vbroadcasti128 m6, [srcq+strideq*0]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m1, [srcq+strideq*0]
+ vbroadcasti128 m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m3, [srcq+strideq*0]
+ shufpd m4, m4, m0, 0x0c
+ shufpd m5, m5, m1, 0x0c
+ punpcklbw m1, m4, m5 ; 01
+ punpckhbw m4, m5 ; 34
+ shufpd m6, m6, m2, 0x0c
+ punpcklbw m2, m5, m6 ; 12
+ punpckhbw m5, m6 ; 45
+ shufpd m0, m0, m3, 0x0c
+ punpcklbw m3, m6, m0 ; 23
+ punpckhbw m6, m0 ; 56
+.v_w16_loop:
+ vbroadcasti128 m12, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m13, [srcq+strideq*0]
+ pmaddubsw m14, m1, m8 ; a0
+ pmaddubsw m15, m2, m8 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddubsw m3, m9 ; a1
+ pmaddubsw m4, m9 ; b1
+ paddw m14, m3
+ paddw m15, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddubsw m5, m10 ; a2
+ pmaddubsw m6, m10 ; b2
+ paddw m14, m5
+ paddw m15, m6
+ shufpd m6, m0, m12, 0x0d
+ shufpd m0, m12, m13, 0x0c
+ punpcklbw m5, m6, m0 ; 67
+ punpckhbw m6, m0 ; 78
+ pmaddubsw m12, m5, m11 ; a3
+ pmaddubsw m13, m6, m11 ; b3
+ paddw m14, m12
+ paddw m15, m13
+ pmulhrsw m14, m7
+ pmulhrsw m15, m7
+ mova [tmpq+wq*0], m14
+ mova [tmpq+wq*1], m15
+ lea tmpq, [tmpq+wq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ add r5, 16
+ add r7, 32
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r6d, 1<<8
+ jg .v_w16_loop0
+ RET
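A worked reading of the "01", "12", "23"-style layout comments used throughout these vertical loops: punpcklbw interleaves the bytes of two adjacent source rows, so one pmaddubsw against a packed (f[k], f[k+1]) coefficient pair accumulates two taps for two output rows at once. In scalar terms (my interpretation of the lane layout, not an authoritative spec):

    // lane i of the "01" register: row0[i]*f[k] + row1[i]*f[k+1] -> row a
    // lane i of the "12" register: row1[i]*f[k] + row2[i]*f[k+1] -> row b

Four such pairs cover all eight taps, which is why each loop iteration produces two output rows from four pmaddubsw/paddw chains.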
+.hv:
+ %assign stack_offset stack_offset - stack_size_padded
+ %assign stack_size_padded 0
+ WIN64_SPILL_XMM 16
+ cmp wd, 4
+ je .hv_w4
+ shr mxd, 16
+ sub srcq, 3
+ vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0]
+ vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ jmp .hv_w8
+.hv_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ mova m7, [subpel_h_shuf4]
+ pmovzxbd m9, [deint_shuf4]
+ vpbroadcastd m10, [pw_8192]
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ vpbroadcastd m11, [pd_32]
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ vpbroadcastq m2, [srcq+strideq*0]
+ vpbroadcastq m4, [srcq+strideq*1]
+ vpbroadcastq m0, [srcq+strideq*2]
+ vpbroadcastq m5, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m3, [srcq+strideq*0]
+ vpbroadcastq m6, [srcq+strideq*1]
+ vpbroadcastq m1, [srcq+strideq*2]
+ vpblendd m2, m4, 0xcc ; 0 1
+ vpblendd m0, m5, 0xcc ; 2 3
+ vpblendd m3, m6, 0xcc ; 4 5
+ pshufb m2, m7 ; 00 01 10 11 02 03 12 13
+ pshufb m0, m7 ; 20 21 30 31 22 23 32 33
+ pshufb m3, m7 ; 40 41 50 51 42 43 52 53
+ pshufb m1, m7 ; 60 61 60 61 62 63 62 63
+ pmaddubsw m2, m8
+ pmaddubsw m0, m8
+ pmaddubsw m3, m8
+ pmaddubsw m1, m8
+ phaddw m2, m0 ; 0a 1a 2a 3a 0b 1b 2b 3b
+ phaddw m3, m1 ; 4a 5a 6a __ 4b 5b 6b __
+ pmulhrsw m2, m10
+ pmulhrsw m3, m10
+ palignr m4, m3, m2, 4 ; 1a 2a 3a 4a 1b 2b 3b 4b
+ punpcklwd m1, m2, m4 ; 01 12
+ punpckhwd m2, m4 ; 23 34
+ pshufd m0, m3, q2121
+ punpcklwd m3, m0 ; 45 56
+.hv_w4_loop:
+ pmaddwd m5, m1, m12 ; a0 b0
+ pmaddwd m6, m2, m12 ; c0 d0
+ pmaddwd m2, m13 ; a1 b1
+ pmaddwd m4, m3, m13 ; c1 d1
+ mova m1, m3
+ pmaddwd m3, m14 ; a2 b2
+ paddd m5, m2
+ vpbroadcastq m2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ paddd m6, m4
+ vpbroadcastq m4, [srcq+strideq*0]
+ paddd m5, m3
+ vpbroadcastq m3, [srcq+strideq*1]
+ vpblendd m2, m4, 0xcc
+ vpbroadcastq m4, [srcq+strideq*2]
+ vpblendd m3, m4, 0xcc
+ pshufb m2, m7
+ pshufb m3, m7
+ pmaddubsw m2, m8
+ pmaddubsw m3, m8
+ phaddw m2, m3
+ pmulhrsw m2, m10
+ palignr m3, m2, m0, 12
+ mova m0, m2
+ punpcklwd m2, m3, m0 ; 67 78
+ punpckhwd m3, m0 ; 89 9a
+ pmaddwd m4, m2, m14 ; c2 d2
+ paddd m6, m11
+ paddd m5, m11
+ paddd m6, m4
+ pmaddwd m4, m2, m15 ; a3 b3
+ paddd m5, m4
+ pmaddwd m4, m3, m15 ; c3 d3
+ paddd m6, m4
+ psrad m5, 6
+ psrad m6, 6
+ packssdw m5, m6
+ vpermd m5, m9, m5
+ mova [tmpq], m5
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ lea r6d, [wq*8-64]
+ mov r5, srcq
+ mov r7, tmpq
+ lea r6d, [hq+r6*4]
+.hv_w8_loop0:
+ vbroadcasti128 m7, [subpel_h_shufA]
+ movu xm4, [srcq+strideq*0]
+ vbroadcasti128 m8, [subpel_h_shufB]
+ movu xm5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m9, [subpel_h_shufC]
+ movu xm6, [srcq+strideq*0]
+ vbroadcasti128 m0, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpblendd m4, m0, 0xf0 ; 0 3
+ vinserti128 m5, [srcq+strideq*0], 1 ; 1 4
+ vinserti128 m6, [srcq+strideq*1], 1 ; 2 5
+ lea srcq, [srcq+strideq*2]
+ vinserti128 m0, [srcq+strideq*0], 1 ; 3 6
+ HV_H_W8 m4, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m5, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m6, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m0, m1, m2, m3, m7, m8, m9
+ vpbroadcastd m7, [pw_8192]
+ vpermq m4, m4, q3120
+ vpermq m5, m5, q3120
+ vpermq m6, m6, q3120
+ pmulhrsw m0, m7
+ pmulhrsw m4, m7
+ pmulhrsw m5, m7
+ pmulhrsw m6, m7
+ vpermq m7, m0, q3120
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ punpcklwd m3, m6, m7 ; 23
+ punpckhwd m6, m7 ; 56
+.hv_w8_loop:
+ vextracti128 [tmpq], m0, 1 ; not enough registers
+ movu xm0, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vinserti128 m0, [srcq+strideq*0], 1 ; 7 8
+ pmaddwd m8, m1, m12 ; a0
+ pmaddwd m9, m2, m12 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m13 ; a1
+ pmaddwd m4, m13 ; b1
+ paddd m8, m3
+ paddd m9, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m14 ; a2
+ pmaddwd m6, m14 ; b2
+ paddd m8, m5
+ paddd m9, m6
+ vbroadcasti128 m6, [subpel_h_shufB]
+ vbroadcasti128 m7, [subpel_h_shufC]
+ vbroadcasti128 m5, [subpel_h_shufA]
+ HV_H_W8 m0, m5, m6, m7, m5, m6, m7
+ vpbroadcastd m5, [pw_8192]
+ vpbroadcastd m7, [pd_32]
+ vbroadcasti128 m6, [tmpq]
+ pmulhrsw m0, m5
+ paddd m8, m7
+ paddd m9, m7
+ vpermq m7, m0, q3120 ; 7 8
+ shufpd m6, m6, m7, 0x04 ; 6 7
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, m15 ; a3
+ paddd m8, m7
+ pmaddwd m7, m6, m15 ; b3
+ paddd m7, m9
+ psrad m8, 6
+ psrad m7, 6
+ packssdw m8, m7
+ vpermq m7, m8, q3120
+ mova [tmpq+wq*0], xm7
+ vextracti128 [tmpq+wq*2], m7, 1
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .hv_w8_loop
+ add r5, 8
+ add r7, 16
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r6d, 1<<8
+ jg .hv_w8_loop0
+ RET
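The prep hv paths run the same two-stage pipeline as put's hv, but round with pd_32 and shift by 6 to stay in prep's intermediate scale instead of put's pd_512/psrad-10/clamp ending. A scalar sketch, assuming mid[] holds the 16-bit horizontal-pass rows and fv[] the vertical filter (illustrative names):

    int sum = 32;                            // pd_32
    for (int k = 0; k < 8; k++)
        sum += fv[k] * mid[(y + k) * midw + x]; // 16x16-bit pmaddwd chain
    tmp[x] = (int16_t)(sum >> 6);            // psrad 6, packssdw, no clamp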
+
+%macro movifprep 2
+ %if isprep
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro REMAP_REG 2
+ %xdefine r%1 r%2
+ %xdefine r%1q r%2q
+ %xdefine r%1d r%2d
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
+ %if isprep
+ %xdefine r14_save r14
+ %assign %%i 14
+ %rep 14
+ %assign %%j %%i-1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i-1
+ %endrep
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
+ %if isprep
+ %assign %%i 1
+ %rep 13
+ %assign %%j %%i+1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i+1
+ %endrep
+ %xdefine r14 r14_save
+ %undef r14_save
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
+ MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+ RET
+ %if %1
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6]
+ movq xm%1, [srcq+ r4]
+ movq xm%2, [srcq+ r6]
+ movhps xm%1, [srcq+ r7]
+ movhps xm%2, [srcq+ r9]
+ vinserti128 m%1, [srcq+r10], 1
+ vinserti128 m%2, [srcq+r11], 1
+ vpbroadcastq m%5, [srcq+r13]
+ vpbroadcastq m%6, [srcq+ rX]
+ add srcq, ssq
+ movq xm%3, [srcq+ r4]
+ movq xm%4, [srcq+ r6]
+ movhps xm%3, [srcq+ r7]
+ movhps xm%4, [srcq+ r9]
+ vinserti128 m%3, [srcq+r10], 1
+ vinserti128 m%4, [srcq+r11], 1
+ vpbroadcastq m%7, [srcq+r13]
+ vpbroadcastq m%8, [srcq+ rX]
+ add srcq, ssq
+ vpblendd m%1, m%5, 0xc0
+ vpblendd m%2, m%6, 0xc0
+ vpblendd m%3, m%7, 0xc0
+ vpblendd m%4, m%8, 0xc0
+ pmaddubsw m%1, m15
+ pmaddubsw m%2, m10
+ pmaddubsw m%3, m15
+ pmaddubsw m%4, m10
+ phaddw m%1, m%2
+ phaddw m%3, m%4
+ phaddw m%1, m%3
+ pmulhrsw m%1, m12
+%endmacro
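MC_8TAP_SCALED_H gathers each output column from its own source offset (r4/r6/r7/r9/r10/r11/r13/rX) because scaled MC has a fixed-point x step: each column resolves both a source position and its own subpel filter from the mx + dx*i accumulator, per the pd_0x3ff/psrld sequences in the body below. A hedged scalar sketch of that address/filter split (my reading of the masks and shifts, names illustrative):

    int pos = mx + dx * i;          // per-column fixed-point x
    int fidx = (pos >> 6) & 0x3ff;  // psrld 6 + pd_0x3ff: subpel filter row
    int ix = pos >> 10;             // psrld 10: integer source column offset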
+
+%macro MC_8TAP_SCALED 1
+%ifidn %1, put
+ %assign isprep 0
+cglobal put_8tap_scaled_8bpc, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %xdefine base_reg r12
+ %define rndshift 10
+%else
+ %assign isprep 1
+cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
+ %define tmp_stridem qword [rsp+120]
+ %xdefine base_reg r11
+ %define rndshift 6
+%endif
+ lea base_reg, [%1_8tap_scaled_8bpc_avx2]
+%define base base_reg-%1_8tap_scaled_8bpc_avx2
+ tzcnt wd, wm
+ vpbroadcastd m8, dxm
+%if isprep && UNIX64
+ movd xm14, mxd
+ vpbroadcastd m14, xm14
+ mov r5d, t0d
+ DECLARE_REG_TMP 5, 7
+%else
+ vpbroadcastd m14, mxm
+%endif
+ mov dyd, dym
+%ifidn %1, put
+ %if WIN64
+ mov r8d, hm
+ DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r5m
+ %define dxm r8m
+ %else
+ DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm r6m
+ %endif
+ %define dsm [rsp+112]
+ %define rX r1
+ %define rXd r1d
+%else ; prep
+ %if WIN64
+ mov r7d, hm
+ DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r4m
+ %define dxm r7m
+ %else
+ DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm [rsp+112]
+ %endif
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %define rX r14
+ %define rXd r14d
+%endif
+ vpbroadcastd m10, [base+pd_0x3ff]
+ vpbroadcastd m12, [base+pw_8192]
+%ifidn %1, put
+ vpbroadcastd m13, [base+pd_512]
+%else
+ vpbroadcastd m13, [base+pd_32]
+%endif
+ pxor m9, m9
+ lea ss3q, [ssq*3]
+ movzx r7d, t1b
+ shr t1d, 16
+ cmp hd, 6
+ cmovs t1d, r7d
+ sub srcq, ss3q
+ cmp dyd, 1024
+ je .dy1
+ cmp dyd, 2048
+ je .dy2
+ movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%ifidn %1, put
+.w2:
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m14, m8 ; mx+dx*[0,1]
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd m15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd m7, [base+subpel_filters+r6*8+2]
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movq xm0, [srcq+ssq*0]
+ movq xm1, [srcq+ssq*2]
+ movhps xm0, [srcq+ssq*1]
+ movhps xm1, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m14, m5
+ paddb m14, m6
+ vinserti128 m0, [srcq+ssq*0], 1
+ vinserti128 m1, [srcq+ssq*2], 1
+ vpbroadcastq m2, [srcq+ssq*1]
+ vpbroadcastq m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ vpblendd m15, m7, 0xaa
+ vpblendd m0, m2, 0xc0 ; 0 1 4 5
+ vpblendd m1, m3, 0xc0 ; 2 3 6 7
+ pblendvb m15, m11, m8
+ pshufb m0, m14
+ pshufb m1, m14
+ pmaddubsw m0, m15
+ pmaddubsw m1, m15
+ phaddw m0, m1
+ pmulhrsw m0, m12 ; 0 1 2 3 4 5 6 7
+ vextracti128 xm1, m0, 1 ; 4 5 6 7
+ palignr xm2, xm1, xm0, 4 ; 1 2 3 4
+ punpcklwd xm3, xm0, xm2 ; 01 12
+ punpckhwd xm0, xm2 ; 23 34
+ pshufd xm4, xm1, q0321 ; 5 6 7 _
+ punpcklwd xm2, xm1, xm4 ; 45 56
+ punpckhwd xm4, xm1, xm4 ; 67 __
+.w2_loop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm11, r6q
+ pmovsxbw xm11, xm11
+ pshufd xm8, xm11, q0000
+ pshufd xm9, xm11, q1111
+ pshufd xm10, xm11, q2222
+ pshufd xm11, xm11, q3333
+ pmaddwd xm5, xm3, xm8
+ pmaddwd xm6, xm0, xm9
+ pmaddwd xm7, xm2, xm10
+ pmaddwd xm8, xm4, xm11
+ paddd xm5, xm6
+ paddd xm7, xm8
+ paddd xm5, xm13
+ paddd xm5, xm7
+ psrad xm5, 10
+ packssdw xm5, xm5
+ packuswb xm5, xm5
+ pextrw [dstq], xm5, 0
+ add dstq, dsq
+ dec hd
+ jz .ret
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w2_loop
+ movq xm5, [srcq]
+ test myd, 0x400
+ jz .w2_skip_line
+ add srcq, ssq
+ shufps xm3, xm0, q1032 ; 01 12
+ shufps xm0, xm2, q1032 ; 23 34
+ shufps xm2, xm4, q1032 ; 45 56
+ pshufb xm5, xm14
+ pmaddubsw xm5, xm15
+ phaddw xm5, xm5
+ pmulhrsw xm5, xm12
+ palignr xm1, xm5, xm1, 12
+ punpcklqdq xm1, xm1 ; 6 7 6 7
+ punpcklwd xm4, xm1, xm5 ; 67 __
+ jmp .w2_loop
+.w2_skip_line:
+ movhps xm5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova xm3, xm0 ; 01 12
+ mova xm0, xm2 ; 23 34
+ pshufb xm5, xm14
+ pmaddubsw xm5, xm15
+ phaddw xm5, xm5
+ pmulhrsw xm5, xm12 ; 6 7 6 7
+ palignr xm1, xm5, xm1, 8 ; 4 5 6 7
+ pshufd xm5, xm1, q0321 ; 5 6 7 _
+ punpcklwd xm2, xm1, xm5 ; 45 56
+ punpckhwd xm4, xm1, xm5 ; 67 __
+ jmp .w2_loop
+%endif
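The per-row vertical stepping in these loops keeps my as a 10-bit phase plus overflow bits. As I read the flag tests (this is an interpretation, not documented behaviour): no overflow reuses the current tap window, bit 10 set shifts in one fresh horizontally-filtered row, and the .w2_skip_line path shifts in two. Roughly:

    my += dy;
    if (my & ~0x3ff) {                    // crossed into new source rows
        int rows = (my & 0x400) ? 1 : 2;  // mirrors the 0x400 test above
        // shift `rows` new rows into the 8-tap window, then my &= 0x3ff
    }
    // ((my & 0x3ff) >> 6) selects the subpel filter for the next output row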
+.w4:
+ mov myd, mym
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m0, m14, m10
+ psrld m0, 6
+ paddd xm15, xm0
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ pextrd r11d, xm15, 2
+ pextrd r13d, xm15, 3
+ movd xm15, [base+subpel_filters+r4*8+2]
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vpbroadcastq m6, [base+subpel_s_shuf2]
+ pinsrd xm15, [base+subpel_filters+r6*8+2], 1
+ pcmpeqd m0, m9
+ psrld m14, 10
+ movu xm7, [srcq+ssq*0]
+ movu xm9, [srcq+ssq*1]
+ pinsrd xm15, [base+subpel_filters+r11*8+2], 2
+ movu xm8, [srcq+ssq*2]
+ movu xm10, [srcq+ss3q ]
+ pinsrd xm15, [base+subpel_filters+r13*8+2], 3
+ lea srcq, [srcq+ssq*4]
+ pshufb m14, m5
+ paddb m14, m6
+ vinserti128 m7, [srcq+ssq*0], 1
+ vinserti128 m9, [srcq+ssq*1], 1
+ vinserti128 m15, xm15, 1
+ vinserti128 m8, [srcq+ssq*2], 1
+ vinserti128 m10, [srcq+ss3q ], 1
+ lea srcq, [srcq+ssq*4]
+ pblendvb m15, m11, m0
+ pshufb m7, m14
+ pshufb m9, m14
+ pshufb m8, m14
+ pshufb m10, m14
+ pmaddubsw m7, m15
+ pmaddubsw m9, m15
+ pmaddubsw m8, m15
+ pmaddubsw m10, m15
+ phaddw m7, m9
+ phaddw m8, m10
+ pmulhrsw m7, m12 ; 0 1 4 5
+ pmulhrsw m8, m12 ; 2 3 6 7
+ vextracti128 xm9, m7, 1 ; 4 5
+ vextracti128 xm3, m8, 1 ; 6 7
+ shufps xm4, xm7, xm8, q1032 ; 1 2
+ shufps xm5, xm8, xm9, q1032 ; 3 4
+ shufps xm6, xm9, xm3, q1032 ; 5 6
+ psrldq xm11, xm3, 8 ; 7 _
+ punpcklwd xm0, xm7, xm4 ; 01
+ punpckhwd xm7, xm4 ; 12
+ punpcklwd xm1, xm8, xm5 ; 23
+ punpckhwd xm8, xm5 ; 34
+ punpcklwd xm2, xm9, xm6 ; 45
+ punpckhwd xm9, xm6 ; 56
+ punpcklwd xm3, xm11 ; 67
+ mova [rsp+0x00], xm7
+ mova [rsp+0x10], xm8
+ mova [rsp+0x20], xm9
+.w4_loop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm10, r6q
+ pmovsxbw xm10, xm10
+ pshufd xm7, xm10, q0000
+ pshufd xm8, xm10, q1111
+ pshufd xm9, xm10, q2222
+ pshufd xm10, xm10, q3333
+ pmaddwd xm4, xm0, xm7
+ pmaddwd xm5, xm1, xm8
+ pmaddwd xm6, xm2, xm9
+ pmaddwd xm7, xm3, xm10
+ paddd xm4, xm5
+ paddd xm6, xm7
+ paddd xm4, xm13
+ paddd xm4, xm6
+ psrad xm4, rndshift
+ packssdw xm4, xm4
+%ifidn %1, put
+ packuswb xm4, xm4
+ movd [dstq], xm4
+ add dstq, dsq
+%else
+ movq [tmpq], xm4
+ add tmpq, 8
+%endif
+ dec hd
+ jz .ret
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w4_loop
+ movu xm4, [srcq]
+ test myd, 0x400
+ jz .w4_skip_line
+ mova xm0, [rsp+0x00]
+ mova [rsp+0x00], xm1
+ mova xm1, [rsp+0x10]
+ mova [rsp+0x10], xm2
+ mova xm2, [rsp+0x20]
+ mova [rsp+0x20], xm3
+ pshufb xm4, xm14
+ pmaddubsw xm4, xm15
+ phaddw xm4, xm4
+ pmulhrsw xm4, xm12
+ punpcklwd xm3, xm11, xm4
+ mova xm11, xm4
+ add srcq, ssq
+ jmp .w4_loop
+.w4_skip_line:
+ movu xm5, [srcq+ssq*1]
+ movu m6, [rsp+0x10]
+ pshufb xm4, xm14
+ pshufb xm5, xm14
+ pmaddubsw xm4, xm15
+ pmaddubsw xm5, xm15
+ movu [rsp+0x00], m6
+ phaddw xm4, xm5
+ pmulhrsw xm4, xm12
+ punpcklwd xm9, xm11, xm4
+ mova [rsp+0x20], xm9
+ psrldq xm11, xm4, 8
+ mova xm0, xm1
+ mova xm1, xm2
+ mova xm2, xm3
+ punpcklwd xm3, xm4, xm11
+ lea srcq, [srcq+ssq*2]
+ jmp .w4_loop
+.w8:
+ mov dword [rsp+48], 1
+ movifprep tmp_stridem, 16
+ jmp .w_start
+.w16:
+ mov dword [rsp+48], 2
+ movifprep tmp_stridem, 32
+ jmp .w_start
+.w32:
+ mov dword [rsp+48], 4
+ movifprep tmp_stridem, 64
+ jmp .w_start
+.w64:
+ mov dword [rsp+48], 8
+ movifprep tmp_stridem, 128
+ jmp .w_start
+.w128:
+ mov dword [rsp+48], 16
+ movifprep tmp_stridem, 256
+.w_start:
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+ shr t0d, 16
+ sub srcq, 3
+ pmaddwd m8, [base+rescale_mul]
+ movd xm15, t0d
+ mov [rsp+72], t0d
+ mov [rsp+56], srcq
+ mov [rsp+64], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m14, m8 ; mx+dx*[0-7]
+ jmp .hloop
+.hloop_prep:
+ dec dword [rsp+48]
+ jz .ret
+ add qword [rsp+64], 8*(isprep+1)
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m10, [base+pd_0x3ff]
+ paddd m14, m8, [rsp+16]
+ vpbroadcastd m15, [rsp+72]
+ pxor m9, m9
+ mov srcq, [rsp+56]
+ mov r0q, [rsp+64] ; dstq / tmpq
+.hloop:
+ vpbroadcastq m11, [base+pq_0x40000000]
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m15, m6
+ pcmpeqd m6, m9
+ vextracti128 xm7, m15, 1
+ movd r4d, xm15
+ pextrd r6d, xm15, 2
+ pextrd r7d, xm15, 1
+ pextrd r9d, xm15, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ movu [rsp+16], m14
+ movq xm15, [base+subpel_filters+ r4*8]
+ movq xm10, [base+subpel_filters+ r6*8]
+ movhps xm15, [base+subpel_filters+ r7*8]
+ movhps xm10, [base+subpel_filters+ r9*8]
+ vinserti128 m15, [base+subpel_filters+r10*8], 1
+ vinserti128 m10, [base+subpel_filters+r11*8], 1
+ vpbroadcastq m9, [base+subpel_filters+r13*8]
+ vpbroadcastq m8, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ vextracti128 xm7, m14, 1
+ mova [rsp], xm14
+ movd r4d, xm14
+ pextrd r6d, xm14, 2
+ pextrd r7d, xm14, 1
+ pextrd r9d, xm14, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ pshufd m5, m6, q1100
+ pshufd m6, m6, q3322
+ vpblendd m15, m9, 0xc0
+ vpblendd m10, m8, 0xc0
+ pblendvb m15, m11, m5
+ pblendvb m10, m11, m6
+ vbroadcasti128 m14, [base+subpel_s_shuf8]
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ mov myd, mym
+ mov dyd, dym
+ pshufb m0, m14 ; 01a 01b
+ pshufb m1, m14 ; 23a 23b
+ pshufb m2, m14 ; 45a 45b
+ pshufb m3, m14 ; 67a 67b
+ vbroadcasti128 m14, [base+wswap]
+.vloop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm11, r6q
+ punpcklqdq xm11, xm11
+ pmovsxbw m11, xm11
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pshufd m8, m11, q2222
+ pshufd m11, m11, q3333
+ pmaddwd m6, m2, m8
+ pmaddwd m7, m3, m11
+ paddd m4, m5
+ paddd m6, m7
+ paddd m4, m13
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ movq [dstq], xm4
+ add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .hloop_prep
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .vloop
+ test myd, 0x400
+ mov [rsp+52], myd
+ mov r4d, [rsp+ 0]
+ mov r6d, [rsp+ 8]
+ mov r7d, [rsp+ 4]
+ mov r9d, [rsp+12]
+ jz .skip_line
+ vpbroadcastq m6, [srcq+r13]
+ vpbroadcastq m7, [srcq+ rX]
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
+ vinserti128 m4, [srcq+r10], 1
+ vinserti128 m5, [srcq+r11], 1
+ add srcq, ssq
+ mov myd, [rsp+52]
+ mov dyd, dym
+ pshufb m0, m14
+ pshufb m1, m14
+ pshufb m2, m14
+ pshufb m3, m14
+ vpblendd m4, m6, 0xc0
+ vpblendd m5, m7, 0xc0
+ pmaddubsw m4, m15
+ pmaddubsw m5, m10
+ phaddw m4, m5
+ pslld m5, m4, 16
+ paddw m4, m5
+ pmulhrsw m4, m12
+ pblendw m0, m1, 0xaa
+ pblendw m1, m2, 0xaa
+ pblendw m2, m3, 0xaa
+ pblendw m3, m4, 0xaa
+ jmp .vloop
+.skip_line:
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ vpbroadcastq m7, [srcq+r13]
+ vpbroadcastq m8, [srcq+ rX]
+ movq xm3, [srcq+ r4]
+ movq xm4, [srcq+ r6]
+ movhps xm3, [srcq+ r7]
+ movhps xm4, [srcq+ r9]
+ vinserti128 m3, [srcq+r10], 1
+ vinserti128 m4, [srcq+r11], 1
+ add srcq, ssq
+ movq xm5, [srcq+ r4]
+ movq xm6, [srcq+ r6]
+ movhps xm5, [srcq+ r7]
+ movhps xm6, [srcq+ r9]
+ vinserti128 m5, [srcq+r10], 1
+ vinserti128 m6, [srcq+r11], 1
+ vpbroadcastq m9, [srcq+r13]
+ vpbroadcastq m11, [srcq+ rX]
+ add srcq, ssq
+ mov myd, [rsp+52]
+ mov dyd, dym
+ vpblendd m3, m7, 0xc0
+ vpblendd m4, m8, 0xc0
+ vpblendd m5, m9, 0xc0
+ vpblendd m6, m11, 0xc0
+ pmaddubsw m3, m15
+ pmaddubsw m4, m10
+ pmaddubsw m5, m15
+ pmaddubsw m6, m10
+ phaddw m3, m4
+ phaddw m5, m6
+ psrld m4, m3, 16
+ pslld m6, m5, 16
+ paddw m3, m4
+ paddw m5, m6
+ pblendw m3, m5, 0xaa
+ pmulhrsw m3, m12
+ jmp .vloop
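The dy == 1024 and dy == 2048 dispatches taken at entry exist because the vertical step is fixed-point with 1024 == 1.0: at exactly 1.0 (or 2.0) source rows per output row the vertical filter phase never changes, so the .dy1/.dy2 paths below can hoist the filter load out of the row loop. A one-line sketch of the invariant being exploited (my reading of the scale):

    // phase(my + n*dy) & 0x3ff is constant when dy is a multiple of 1024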
4 + pshufd xm4, xm1, q2121 + punpcklwd xm3, xm0, xm2 ; 01 12 + punpckhwd xm0, xm2 ; 23 34 + punpcklwd xm2, xm1, xm4 ; 45 56 +.dy1_w2_loop: + movq xm1, [srcq+ssq*0] + movhps xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddwd xm5, xm3, xm8 + pmaddwd xm6, xm0, xm9 + pmaddwd xm7, xm2, xm10 + mova xm3, xm0 + mova xm0, xm2 + paddd xm5, xm13 + paddd xm6, xm7 + pshufb xm1, xm14 + pmaddubsw xm1, xm15 + phaddw xm1, xm1 + pmulhrsw xm1, xm12 + palignr xm7, xm1, xm4, 12 + punpcklwd xm2, xm7, xm1 ; 67 78 + pmaddwd xm7, xm2, xm11 + mova xm4, xm1 + paddd xm5, xm6 + paddd xm5, xm7 + psrad xm5, rndshift + packssdw xm5, xm5 + packuswb xm5, xm5 + pextrw [dstq+dsq*0], xm5, 0 + pextrw [dstq+dsq*1], xm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy1_w2_loop + RET +%endif +.dy1_w4: + mov myd, mym + vbroadcasti128 m7, [base+rescale_mul] + movzx t0d, t0b + dec srcq + movd xm15, t0d + pmaddwd m8, m7 + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + paddd m14, m8 ; mx+dx*[0-3] + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + vpermq m8, m8, q3120 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r11d, xm15, 1 + pextrd r13d, xm15, 3 + movd xm15, [base+subpel_filters+r4*8+2] + vpbroadcastd m7, [base+subpel_filters+r6*8+2] + movu xm2, [srcq+ssq*0] + movu xm3, [srcq+ssq*2] + vbroadcasti128 m5, [base+bdct_lb_dw] + vpbroadcastq m6, [base+subpel_s_shuf2] + pcmpeqd m8, m9 + psrld m14, 10 + pinsrd xm15, [base+subpel_filters+r11*8+2], 1 + vpblendd m7, [base+subpel_filters+r13*8+2-20], 0x20 + vinserti128 m2, [srcq+ssq*1], 1 + vinserti128 m3, [srcq+ss3q ], 1 + lea srcq, [srcq+ssq*4] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pshufb m14, m5 + paddb m14, m6 + movu xm4, [srcq+ssq*0] + movu xm5, [srcq+ssq*2] + vinserti128 m4, [srcq+ssq*1], 1 + add srcq, ss3q + vpblendd m15, m7, 0x30 + punpcklqdq m15, m15 + pblendvb m15, m11, m8 + movq xm10, r4q + punpcklqdq xm10, xm10 + pmovsxbw m10, xm10 + pshufb m2, m14 + pshufb m3, m14 + pshufb m4, m14 + pshufb xm5, xm14 + vpermq m2, m2, q3120 + vpermq m3, m3, q3120 + vpermq m4, m4, q3120 + vpermq m5, m5, q3120 + pshufd m7, m10, q0000 + pshufd m8, m10, q1111 + pshufd m9, m10, q2222 + pshufd m10, m10, q3333 + pmaddubsw m2, m15 + pmaddubsw m3, m15 + pmaddubsw m4, m15 + pmaddubsw m5, m15 + phaddw m2, m3 + phaddw m4, m5 + pmulhrsw m2, m12 + pmulhrsw m4, m12 + palignr m5, m4, m2, 4 + pshufd m3, m4, q2121 + punpcklwd m0, m2, m5 ; 01 12 + punpckhwd m1, m2, m5 ; 23 34 + punpcklwd m2, m4, m3 ; 45 56 +.dy1_w4_loop: + movu xm11, [srcq+ssq*0] + vinserti128 m11, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pmaddwd m4, m0, m7 + pmaddwd m5, m1, m8 + pmaddwd m6, m2, m9 + mova m0, m1 + mova m1, m2 + paddd m4, m13 + paddd m5, m6 + pshufb m11, m14 + vpermq m11, m11, q3120 + pmaddubsw m11, m15 + phaddw m11, m11 + pmulhrsw m11, m12 + palignr m6, m11, m3, 12 + punpcklwd m2, m6, m11 ; 67 78 + mova m3, m11 + pmaddwd m6, m2, m10 + paddd m4, m5 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + pshuflw xm4, xm4, q3120 + movd [dstq+dsq*0], xm4 + pextrd [dstq+dsq*1], xm4, 1 + lea dstq, [dstq+dsq*2] +%else + pshufd xm4, xm4, q3120 + mova [tmpq], xm4 + add tmpq, 16 +%endif + sub hd, 2 + jg .dy1_w4_loop + MC_8TAP_SCALED_RET +.dy1_w8: + mov dword [rsp+72], 1 + movifprep tmp_stridem, 16 + jmp .dy1_w_start +.dy1_w16: + mov dword [rsp+72], 2 + movifprep tmp_stridem, 32 + jmp .dy1_w_start +.dy1_w32: + mov dword [rsp+72], 4 + movifprep tmp_stridem, 64 + jmp .dy1_w_start +.dy1_w64: + mov dword 
[rsp+72], 8 + movifprep tmp_stridem, 128 + jmp .dy1_w_start +.dy1_w128: + mov dword [rsp+72], 16 + movifprep tmp_stridem, 256 +.dy1_w_start: + mov myd, mym +%ifidn %1, put + movifnidn dsm, dsq +%endif + shr t0d, 16 + sub srcq, 3 + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pmaddwd m8, [base+rescale_mul] + movd xm15, t0d + mov [rsp+76], t0d + mov [rsp+80], srcq + mov [rsp+88], r0q ; dstq / tmpq +%if UNIX64 + mov hm, hd +%endif + shl dword dxm, 3 ; dx*8 + vpbroadcastd m15, xm15 + paddd m14, m8 ; mx+dx*[0-7] + movq xm0, r4q + pmovsxbw xm0, xm0 + mova [rsp+96], xm0 + jmp .dy1_hloop +.dy1_hloop_prep: + dec dword [rsp+72] + jz .ret + add qword [rsp+88], 8*(isprep+1) + mov hd, hm + vpbroadcastd m8, dxm + vpbroadcastd m10, [base+pd_0x3ff] + paddd m14, m8, [rsp+32] + vpbroadcastd m15, [rsp+76] + pxor m9, m9 + mov srcq, [rsp+80] + mov r0q, [rsp+88] ; dstq / tmpq +.dy1_hloop: + vpbroadcastq m11, [base+pq_0x40000000] + pand m6, m14, m10 + psrld m6, 6 + paddd m15, m6 + pcmpeqd m6, m9 + vextracti128 xm7, m15, 1 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r7d, xm15, 1 + pextrd r9d, xm15, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + movu [rsp+32], m14 + movq xm15, [base+subpel_filters+ r4*8] + movq xm10, [base+subpel_filters+ r6*8] + movhps xm15, [base+subpel_filters+ r7*8] + movhps xm10, [base+subpel_filters+ r9*8] + vinserti128 m15, [base+subpel_filters+r10*8], 1 + vinserti128 m10, [base+subpel_filters+r11*8], 1 + vpbroadcastq m9, [base+subpel_filters+r13*8] + vpbroadcastq m8, [base+subpel_filters+ rX*8] + psrld m14, 10 + vextracti128 xm7, m14, 1 + movq [rsp+64], xm14 + movd r4d, xm14 + pextrd r6d, xm14, 2 + pextrd r7d, xm14, 1 + pextrd r9d, xm14, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + pshufd m5, m6, q1100 + pshufd m6, m6, q3322 + vpblendd m15, m9, 0xc0 + vpblendd m10, m8, 0xc0 + pblendvb m15, m11, m5 + pblendvb m10, m11, m6 + vbroadcasti128 m14, [base+subpel_s_shuf8] + MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b + movu [rsp], m10 + vpbroadcastd m8, [rsp+0x60] + vpbroadcastd m9, [rsp+0x64] + vpbroadcastd m10, [rsp+0x68] + vpbroadcastd m11, [rsp+0x6c] + pshufb m0, m14 ; 01a 01b + pshufb m1, m14 ; 23a 23b + pshufb m2, m14 ; 45a 45b + pshufb m3, m14 ; 67a 67b + vbroadcasti128 m14, [base+wswap] +.dy1_vloop: + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pmaddwd m6, m2, m10 + pmaddwd m7, m3, m11 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + movq [dstq], xm4 + add dstq, dsm +%else + mova [tmpq], xm4 + add tmpq, tmp_stridem +%endif + dec hd + jz .dy1_hloop_prep + movq xm4, [srcq+ r4] + movq xm5, [srcq+ r6] + movhps xm4, [srcq+ r7] + movhps xm5, [srcq+ r9] + vinserti128 m4, [srcq+r10], 1 + vinserti128 m5, [srcq+r11], 1 + vpbroadcastq m6, [srcq+r13] + vpbroadcastq m7, [srcq+ rX] + add srcq, ssq + pshufb m0, m14 + pshufb m1, m14 + pshufb m2, m14 + pshufb m3, m14 + vpblendd m4, m6, 0xc0 + vpblendd m5, m7, 0xc0 + pmaddubsw m4, m15 + pmaddubsw m5, [rsp] + phaddw m4, m5 + pslld m5, m4, 16 + paddw m4, m5 + pmulhrsw m4, m12 + pblendw m0, m1, 0xaa + pblendw m1, m2, 0xaa + pblendw m2, m3, 0xaa + pblendw m3, m4, 0xaa + jmp .dy1_vloop +.dy2: + movzx wd, word 
[base+%1_8tap_scaled_avx2_dy2_table+wq*2] + add wq, base_reg + jmp wq +%ifidn %1, put +.dy2_w2: + mov myd, mym + movzx t0d, t0b + dec srcq + movd xm15, t0d + punpckldq m8, m9, m8 + paddd m14, m8 ; mx+dx*[0-1] + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + vbroadcasti128 m5, [base+bdct_lb_dw] + vbroadcasti128 m6, [base+subpel_s_shuf2] + vpbroadcastd m15, [base+subpel_filters+r4*8+2] + vpbroadcastd m7, [base+subpel_filters+r6*8+2] + pcmpeqd m8, m9 + psrld m14, 10 + movq xm0, [srcq+ssq*0] + vpbroadcastq m2, [srcq+ssq*1] + movhps xm0, [srcq+ssq*2] + vpbroadcastq m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pshufb m14, m5 + paddb m14, m6 + vpblendd m15, m7, 0xaa + pblendvb m15, m11, m8 + movhps xm1, [srcq+ssq*0] + vpbroadcastq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + vpblendd m0, m2, 0x30 + vpblendd m1, m4, 0xc0 + vpblendd m0, m3, 0xc0 + pshufb m0, m14 + pshufb m1, m14 + pmaddubsw m0, m15 + pmaddubsw m1, m15 + movq xm11, r4q + pmovsxbw xm11, xm11 + phaddw m0, m1 + pmulhrsw m0, m12 ; 0 2 _ 4 1 3 _ 5 + pshufd xm8, xm11, q0000 + pshufd xm9, xm11, q1111 + pshufd xm10, xm11, q2222 + pshufd xm11, xm11, q3333 + pshufd m2, m0, q3110 ; 0 2 2 4 1 3 3 5 + vextracti128 xm1, m2, 1 + punpcklwd xm3, xm2, xm1 ; 01 23 + punpckhwd xm2, xm1 ; 23 45 +.dy2_w2_loop: + movq xm6, [srcq+ssq*0] + vpbroadcastq m7, [srcq+ssq*1] + movhps xm6, [srcq+ssq*2] + vpbroadcastq m1, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pmaddwd xm4, xm3, xm8 + pmaddwd xm5, xm2, xm9 + vpblendd m6, m7, 0x30 + vpblendd m6, m1, 0xc0 + pshufb m6, m14 + pmaddubsw m6, m15 + phaddw m6, m6 + pmulhrsw m6, m12 + palignr m0, m6, m0, 8 + pshufd m2, m0, q3221 + vextracti128 xm1, m2, 1 + punpcklwd xm3, xm2, xm1 ; 45 67 + punpckhwd xm2, xm1 ; 67 89 + pmaddwd xm6, xm3, xm10 + pmaddwd xm7, xm2, xm11 + paddd xm4, xm5 + paddd xm4, xm13 + paddd xm6, xm7 + paddd xm4, xm6 + psrad xm4, rndshift + packssdw xm4, xm4 + packuswb xm4, xm4 + pextrw [dstq+dsq*0], xm4, 0 + pextrw [dstq+dsq*1], xm4, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy2_w2_loop + RET +%endif +.dy2_w4: + mov myd, mym + vbroadcasti128 m7, [base+rescale_mul] + movzx t0d, t0b + dec srcq + movd xm15, t0d + pmaddwd m8, m7 + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + paddd m14, m8 ; mx+dx*[0-3] + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + pextrd r11d, xm15, 2 + pextrd r13d, xm15, 3 + movd xm15, [base+subpel_filters+r4*8+2] + vbroadcasti128 m5, [base+bdct_lb_dw] + vpbroadcastq m6, [base+subpel_s_shuf2] + pinsrd xm15, [base+subpel_filters+r6*8+2], 1 + pcmpeqd m8, m9 + psrld m14, 10 + movu xm0, [srcq+ssq*0] + movu xm2, [srcq+ssq*2] + pinsrd xm15, [base+subpel_filters+r11*8+2], 2 + movu xm1, [srcq+ssq*1] + movu xm3, [srcq+ss3q ] + pinsrd xm15, [base+subpel_filters+r13*8+2], 3 + lea srcq, [srcq+ssq*4] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + vinserti128 m15, xm15, 1 + pshufb m14, m5 + paddb m14, m6 + vinserti128 m2, [srcq+ssq*0], 1 + vinserti128 m3, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pblendvb m15, m11, m8 + pshufb xm0, xm14 + pshufb m2, m14 + pshufb xm1, xm14 + pshufb m3, m14 + pmaddubsw xm0, xm15 + pmaddubsw m2, m15 + pmaddubsw xm1, xm15 + pmaddubsw m3, m15 + movq xm11, r4q + punpcklqdq xm11, xm11 + pmovsxbw m11, xm11 + phaddw m0, m2 + phaddw m1, m3 + pmulhrsw m0, m12 ; 0 2 _ 4 + 
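; m0/m1 now hold the horizontally filtered even/odd rows; the punpcklwd/punpckhwd below interleave them into the 01 23 / 23 45 word pairs consumed by pmaddwd in .dy2_w4_loop +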
pmulhrsw m1, m12 ; 1 3 _ 5 + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pshufd m10, m11, q2222 + pshufd m11, m11, q3333 + punpcklwd xm2, xm0, xm1 + punpckhwd m1, m0, m1 ; 23 45 + vinserti128 m0, m2, xm1, 1 ; 01 23 +.dy2_w4_loop: + movu xm6, [srcq+ssq*0] + movu xm7, [srcq+ssq*1] + vinserti128 m6, [srcq+ssq*2], 1 + vinserti128 m7, [srcq+ss3q ], 1 + lea srcq, [srcq+ssq*4] + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pshufb m6, m14 + pshufb m7, m14 + pmaddubsw m6, m15 + pmaddubsw m7, m15 + psrld m2, m6, 16 + pslld m3, m7, 16 + paddw m6, m2 + paddw m7, m3 + pblendw m6, m7, 0xaa ; 67 89 + pmulhrsw m6, m12 + paddd m4, m5 + vperm2i128 m0, m1, m6, 0x21 ; 45 67 + mova m1, m6 + pmaddwd m6, m0, m10 + pmaddwd m7, m1, m11 + paddd m4, m13 + paddd m6, m7 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + movd [dstq+dsq*0], xm4 + pextrd [dstq+dsq*1], xm4, 1 + lea dstq, [dstq+dsq*2] +%else + mova [tmpq], xm4 + add tmpq, 16 +%endif + sub hd, 2 + jg .dy2_w4_loop + MC_8TAP_SCALED_RET +.dy2_w8: + mov dword [rsp+40], 1 + movifprep tmp_stridem, 16 + jmp .dy2_w_start +.dy2_w16: + mov dword [rsp+40], 2 + movifprep tmp_stridem, 32 + jmp .dy2_w_start +.dy2_w32: + mov dword [rsp+40], 4 + movifprep tmp_stridem, 64 + jmp .dy2_w_start +.dy2_w64: + mov dword [rsp+40], 8 + movifprep tmp_stridem, 128 + jmp .dy2_w_start +.dy2_w128: + mov dword [rsp+40], 16 + movifprep tmp_stridem, 256 +.dy2_w_start: + mov myd, mym +%ifidn %1, put + movifnidn dsm, dsq +%endif + shr t0d, 16 + sub srcq, 3 + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pmaddwd m8, [base+rescale_mul] + movd xm15, t0d + mov [rsp+64], t0d + mov [rsp+48], srcq + mov [rsp+56], r0q ; dstq / tmpq +%if UNIX64 + mov hm, hd +%endif + shl dword dxm, 3 ; dx*8 + vpbroadcastd m15, xm15 + paddd m14, m8 ; mx+dx*[0-7] + movq xm0, r4q + pmovsxbw xm0, xm0 + mova [rsp+0x50], xm0 + jmp .dy2_hloop +.dy2_hloop_prep: + dec dword [rsp+40] + jz .ret + add qword [rsp+56], 8*(isprep+1) + mov hd, hm + vpbroadcastd m8, dxm + vpbroadcastd m10, [base+pd_0x3ff] + paddd m14, m8, [rsp] + vpbroadcastd m15, [rsp+64] + pxor m9, m9 + mov srcq, [rsp+48] + mov r0q, [rsp+56] ; dstq / tmpq +.dy2_hloop: + vpbroadcastq m11, [base+pq_0x40000000] + pand m6, m14, m10 + psrld m6, 6 + paddd m15, m6 + pcmpeqd m6, m9 + vextracti128 xm7, m15, 1 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r7d, xm15, 1 + pextrd r9d, xm15, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + movu [rsp], m14 + movq xm15, [base+subpel_filters+ r4*8] + movq xm10, [base+subpel_filters+ r6*8] + movhps xm15, [base+subpel_filters+ r7*8] + movhps xm10, [base+subpel_filters+ r9*8] + vinserti128 m15, [base+subpel_filters+r10*8], 1 + vinserti128 m10, [base+subpel_filters+r11*8], 1 + vpbroadcastq m9, [base+subpel_filters+r13*8] + vpbroadcastq m8, [base+subpel_filters+ rX*8] + psrld m14, 10 + vextracti128 xm7, m14, 1 + movd r4d, xm14 + pextrd r6d, xm14, 2 + pextrd r7d, xm14, 1 + pextrd r9d, xm14, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + pshufd m5, m6, q1100 + pshufd m6, m6, q3322 + vpblendd m15, m9, 0xc0 + vpblendd m10, m8, 0xc0 + pblendvb m15, m11, m5 + pblendvb m10, m11, m6 + vbroadcasti128 m14, [base+subpel_s_shuf8] + MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b + 
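; the four MC_8TAP_SCALED_H calls above leave the horizontally filtered rows 0-7 packed in m0-m3; the vpbroadcastd loads below fetch the signed vertical tap pairs spilled to [rsp+0x50] in .dy2_w_start +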
vpbroadcastd m8, [rsp+0x50] + vpbroadcastd m9, [rsp+0x54] + vpbroadcastd m11, [rsp+0x58] + vpbroadcastd m4, [rsp+0x5c] + pshufb m0, m14 ; 01a 01b + pshufb m1, m14 ; 23a 23b + pshufb m2, m14 ; 45a 45b + pshufb m3, m14 ; 67a 67b + SWAP m14, m4 +.dy2_vloop: + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pmaddwd m6, m2, m11 + pmaddwd m7, m3, m14 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + movq [dstq], xm4 + add dstq, dsm +%else + mova [tmpq], xm4 + add tmpq, tmp_stridem +%endif + dec hd + jz .dy2_hloop_prep + mova m0, m1 + mova m1, m2 + mova m2, m3 + movq xm3, [srcq+ r4] + movq xm4, [srcq+ r6] + movhps xm3, [srcq+ r7] + movhps xm4, [srcq+ r9] + vinserti128 m3, [srcq+r10], 1 + vinserti128 m4, [srcq+r11], 1 + vpbroadcastq m5, [srcq+r13] + vpbroadcastq m6, [srcq+ rX] + add srcq, ssq + vpblendd m3, m5, 0xc0 + vpblendd m4, m6, 0xc0 + pmaddubsw m3, m15 + pmaddubsw m4, m10 + phaddw m3, m4 + movq xm4, [srcq+ r4] + movq xm5, [srcq+ r6] + movhps xm4, [srcq+ r7] + movhps xm5, [srcq+ r9] + vinserti128 m4, [srcq+r10], 1 + vinserti128 m5, [srcq+r11], 1 + vpbroadcastq m6, [srcq+r13] + vpbroadcastq m7, [srcq+ rX] + add srcq, ssq + vpblendd m4, m6, 0xc0 + vpblendd m5, m7, 0xc0 + pmaddubsw m4, m15 + pmaddubsw m5, m10 + phaddw m4, m5 + psrld m5, m3, 16 + pslld m6, m4, 16 + paddw m3, m5 + paddw m4, m6 + pblendw m3, m4, 0xaa + pmulhrsw m3, m12 + jmp .dy2_vloop +.ret: + MC_8TAP_SCALED_RET 0 +%undef isprep +%endmacro + +%macro BILIN_SCALED_FN 1 +cglobal %1_bilin_scaled_8bpc + mov t0d, (5*15 << 16) | 5*15 + mov t1d, t0d + jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX) +%endmacro + +%if WIN64 +DECLARE_REG_TMP 6, 5 +%else +DECLARE_REG_TMP 6, 8 +%endif + +%define PUT_8TAP_SCALED_FN FN put_8tap_scaled, +%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled, + +BILIN_SCALED_FN put +PUT_8TAP_SCALED_FN sharp, SHARP, SHARP +PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH +PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP +PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH +PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR +PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP +PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR +PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR +MC_8TAP_SCALED put + +%if WIN64 +DECLARE_REG_TMP 5, 4 +%else +DECLARE_REG_TMP 6, 7 +%endif + +BILIN_SCALED_FN prep +PREP_8TAP_SCALED_FN sharp, SHARP, SHARP +PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH +PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP +PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH +PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR +PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP +PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR +PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH +PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR +MC_8TAP_SCALED prep + +%macro WARP_V 5 ; dst, 02, 46, 13, 57 + ; Can be done using gathers, but that's terribly slow on many CPUs + lea tmp1d, [myq+deltaq*4] + lea tmp2d, [myq+deltaq*1] + shr myd, 10 + shr tmp1d, 10 + movq xm8, [filterq+myq *8] + vinserti128 m8, [filterq+tmp1q*8], 1 ; a e + lea tmp1d, [tmp2q+deltaq*4] + lea myd, [tmp2q+deltaq*1] + shr tmp2d, 10 + shr tmp1d, 10 + movq xm0, [filterq+tmp2q*8] + vinserti128 m0, [filterq+tmp1q*8], 1 ; b f + lea tmp1d, [myq+deltaq*4] + lea tmp2d, [myq+deltaq*1] + shr myd, 10 + shr tmp1d, 10 + movq xm9, [filterq+myq *8] + vinserti128 m9, [filterq+tmp1q*8], 1 ; c g + lea tmp1d, [tmp2q+deltaq*4] + lea myd, 
[tmp2q+gammaq] ; my += gamma + shr tmp2d, 10 + shr tmp1d, 10 + punpcklwd m8, m0 + movq xm0, [filterq+tmp2q*8] + vinserti128 m0, [filterq+tmp1q*8], 1 ; d h + punpcklwd m0, m9, m0 + punpckldq m9, m8, m0 + punpckhdq m0, m8, m0 + punpcklbw m8, m11, m9 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8 + punpckhbw m9, m11, m9 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8 + pmaddwd m%2, m8 + pmaddwd m9, m%3 + punpcklbw m8, m11, m0 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8 + punpckhbw m0, m11, m0 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8 + pmaddwd m8, m%4 + pmaddwd m0, m%5 + paddd m%2, m9 + paddd m0, m8 + paddd m%1, m0, m%2 +%endmacro + +cglobal warp_affine_8x8t_8bpc, 0, 14, 0, tmp, ts +%if WIN64 + sub rsp, 0xa0 +%endif + call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main +.loop: + psrad m7, 13 + psrad m0, 13 + packssdw m7, m0 + pmulhrsw m7, m14 ; (x + (1 << 6)) >> 7 + vpermq m7, m7, q3120 + mova [tmpq+tsq*0], xm7 + vextracti128 [tmpq+tsq*2], m7, 1 + dec r4d + jz mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).end + call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main2 + lea tmpq, [tmpq+tsq*4] + jmp .loop + +cglobal warp_affine_8x8_8bpc, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \ + beta, filter, tmp1, delta, my, gamma +%if WIN64 + sub rsp, 0xa0 + %assign xmm_regs_used 16 + %assign stack_size_padded 0xa0 + %assign stack_offset stack_offset+stack_size_padded +%endif + call .main + jmp .start +.loop: + call .main2 + lea dstq, [dstq+dsq*2] +.start: + psrad m7, 18 + psrad m0, 18 + packusdw m7, m0 + pavgw m7, m11 ; (x + (1 << 10)) >> 11 + vextracti128 xm0, m7, 1 + packuswb xm7, xm0 + pshufd xm7, xm7, q3120 + movq [dstq+dsq*0], xm7 + movhps [dstq+dsq*1], xm7 + dec r4d + jg .loop +.end: + RET +ALIGN function_align +.main: + ; Stack args offset by one (r4m -> r5m etc.) due to call +%if WIN64 + mov abcdq, r5m + mov mxd, r6m + movaps [rsp+stack_offset+0x10], xmm6 + movaps [rsp+stack_offset+0x20], xmm7 + movaps [rsp+0x28], xmm8 + movaps [rsp+0x38], xmm9 + movaps [rsp+0x48], xmm10 + movaps [rsp+0x58], xmm11 + movaps [rsp+0x68], xmm12 + movaps [rsp+0x78], xmm13 + movaps [rsp+0x88], xmm14 + movaps [rsp+0x98], xmm15 +%endif + movsx alphad, word [abcdq+2*0] + movsx betad, word [abcdq+2*1] + mova m12, [warp_8x8_shufA] + mova m13, [warp_8x8_shufB] + vpbroadcastd m14, [pw_8192] + vpbroadcastd m15, [pd_32768] + pxor m11, m11 + lea filterq, [mc_warp_filter2] + lea tmp1q, [ssq*3+3] + add mxd, 512+(64<<10) + lea tmp2d, [alphaq*3] + sub srcq, tmp1q ; src -= src_stride*3 + 3 + sub betad, tmp2d ; beta -= alpha*3 + mov myd, r7m + call .h + psrld m1, m0, 16 + call .h + psrld m4, m0, 16 + call .h + pblendw m1, m0, 0xaa ; 02 + call .h + pblendw m4, m0, 0xaa ; 13 + call .h + psrld m2, m1, 16 + pblendw m2, m0, 0xaa ; 24 + call .h + psrld m5, m4, 16 + pblendw m5, m0, 0xaa ; 35 + call .h + psrld m3, m2, 16 + pblendw m3, m0, 0xaa ; 46 + movsx deltad, word [abcdq+2*2] + movsx gammad, word [abcdq+2*3] + add myd, 512+(64<<10) + mov r4d, 4 + lea tmp1d, [deltaq*3] + sub gammad, tmp1d ; gamma -= delta*3 +.main2: + call .h + psrld m6, m5, 16 + pblendw m6, m0, 0xaa ; 57 + WARP_V 7, 1, 3, 4, 6 + call .h + mova m1, m2 + mova m2, m3 + psrld m3, 16 + pblendw m3, m0, 0xaa ; 68 + WARP_V 0, 4, 6, 1, 3 + mova m4, m5 + mova m5, m6 + ret +ALIGN function_align +.h: + lea tmp1d, [mxq+alphaq*4] + lea tmp2d, [mxq+alphaq*1] + vbroadcasti128 m10, [srcq] + shr mxd, 10 + shr tmp1d, 10 + movq xm8, [filterq+mxq *8] + vinserti128 m8, [filterq+tmp1q*8], 1 + lea tmp1d, [tmp2q+alphaq*4] + lea mxd, [tmp2q+alphaq*1] + shr tmp2d, 10 + shr tmp1d, 10 + movq xm0, [filterq+tmp2q*8] + 
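; as in WARP_V, each movq+vinserti128 pair gathers the 8-tap filters for two output pixels (x and x+4) indexed by mx >> 10 +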
vinserti128 m0, [filterq+tmp1q*8], 1 + lea tmp1d, [mxq+alphaq*4] + lea tmp2d, [mxq+alphaq*1] + shr mxd, 10 + shr tmp1d, 10 + movq xm9, [filterq+mxq *8] + vinserti128 m9, [filterq+tmp1q*8], 1 + lea tmp1d, [tmp2q+alphaq*4] + lea mxd, [tmp2q+betaq] ; mx += beta + shr tmp2d, 10 + shr tmp1d, 10 + punpcklqdq m8, m0 ; 0 1 4 5 + movq xm0, [filterq+tmp2q*8] + vinserti128 m0, [filterq+tmp1q*8], 1 + punpcklqdq m9, m0 ; 2 3 6 7 + pshufb m0, m10, m12 + pmaddubsw m0, m8 + pshufb m10, m13 + pmaddubsw m10, m9 + add srcq, ssq + phaddw m0, m10 + pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13 + paddd m0, m15 ; rounded 14-bit result in upper 16 bits of dword + ret + +%macro BIDIR_FN 1 ; op + %1 0 + lea stride3q, [strideq*3] + jmp wq +.w4: + vextracti128 xm1, m0, 1 + movd [dstq ], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 + cmp hd, 4 + je .ret + lea dstq, [dstq+strideq*4] + pextrd [dstq ], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 + cmp hd, 8 + je .ret + %1 2 + lea dstq, [dstq+strideq*4] + vextracti128 xm1, m0, 1 + movd [dstq ], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 + lea dstq, [dstq+strideq*4] + pextrd [dstq ], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 +.ret: + RET +.w8_loop: + %1_INC_PTR 2 + %1 0 + lea dstq, [dstq+strideq*4] +.w8: + vextracti128 xm1, m0, 1 + movq [dstq ], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm1 + sub hd, 4 + jg .w8_loop + RET +.w16_loop: + %1_INC_PTR 4 + %1 0 + lea dstq, [dstq+strideq*4] +.w16: + vpermq m0, m0, q3120 + mova [dstq ], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + %1 2 + vpermq m0, m0, q3120 + mova [dstq+strideq*2], xm0 + vextracti128 [dstq+stride3q ], m0, 1 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + %1_INC_PTR 4 + %1 0 + lea dstq, [dstq+strideq*2] +.w32: + vpermq m0, m0, q3120 + mova [dstq+strideq*0], m0 + %1 2 + vpermq m0, m0, q3120 + mova [dstq+strideq*1], m0 + sub hd, 2 + jg .w32_loop + RET +.w64_loop: + %1_INC_PTR 4 + %1 0 + add dstq, strideq +.w64: + vpermq m0, m0, q3120 + mova [dstq], m0 + %1 2 + vpermq m0, m0, q3120 + mova [dstq+32], m0 + dec hd + jg .w64_loop + RET +.w128_loop: + %1 0 + add dstq, strideq +.w128: + vpermq m0, m0, q3120 + mova [dstq+0*32], m0 + %1 2 + vpermq m0, m0, q3120 + mova [dstq+1*32], m0 + %1_INC_PTR 8 + %1 -4 + vpermq m0, m0, q3120 + mova [dstq+2*32], m0 + %1 -2 + vpermq m0, m0, q3120 + mova [dstq+3*32], m0 + dec hd + jg .w128_loop + RET +%endmacro + +%macro AVG 1 ; src_offset + mova m0, [tmp1q+(%1+0)*32] + paddw m0, [tmp2q+(%1+0)*32] + mova m1, [tmp1q+(%1+1)*32] + paddw m1, [tmp2q+(%1+1)*32] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + packuswb m0, m1 +%endmacro + +%macro AVG_INC_PTR 1 + add tmp1q, %1*32 + add tmp2q, %1*32 +%endmacro + +cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 +%define base r6-avg %+ SUFFIX %+ _table + lea r6, [avg %+ SUFFIX %+ _table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, dword [r6+wq*4] + vpbroadcastd m2, [base+pw_1024] + add wq, r6 + BIDIR_FN AVG + +%macro W_AVG 1 ; src_offset + ; (a * weight + b * (16 - weight) + 128) >> 8 + ; = ((a - b) * weight + (b << 4) + 128) >> 8 + ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4 + ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4 + mova m0, [tmp1q+(%1+0)*32] + psubw m2, m0, [tmp2q+(%1+0)*32] + mova m1, [tmp1q+(%1+1)*32] + psubw 
m3, m1, [tmp2q+(%1+1)*32] + pmulhw m2, m4 + pmulhw m3, m4 + paddw m0, m2 + paddw m1, m3 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 +%endmacro + +%define W_AVG_INC_PTR AVG_INC_PTR + +cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 +%define base r6-w_avg %+ SUFFIX %+ _table + lea r6, [w_avg %+ SUFFIX %+ _table] + tzcnt wd, wm + movifnidn hd, hm + vpbroadcastw m4, r6m ; weight + movsxd wq, dword [r6+wq*4] + vpbroadcastd m5, [base+pw_2048] + psllw m4, 12 ; (weight-16) << 12 when interpreted as signed + add wq, r6 + cmp dword r6m, 7 + jg .weight_gt7 + mov r6, tmp1q + pxor m0, m0 + mov tmp1q, tmp2q + psubw m4, m0, m4 ; -weight + mov tmp2q, r6 +.weight_gt7: + BIDIR_FN W_AVG + +%macro MASK 1 ; src_offset + ; (a * m + b * (64 - m) + 512) >> 10 + ; = ((a - b) * m + (b << 6) + 512) >> 10 + ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4 + vpermq m3, [maskq+%1*16], q3120 + mova m0, [tmp2q+(%1+0)*32] + psubw m1, m0, [tmp1q+(%1+0)*32] + psubb m3, m4, m3 + paddw m1, m1 ; (b - a) << 1 + paddb m3, m3 + punpcklbw m2, m4, m3 ; -m << 9 + pmulhw m1, m2 + paddw m0, m1 + mova m1, [tmp2q+(%1+1)*32] + psubw m2, m1, [tmp1q+(%1+1)*32] + paddw m2, m2 + punpckhbw m3, m4, m3 + pmulhw m2, m3 + paddw m1, m2 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 +%endmacro + +%macro MASK_INC_PTR 1 + add maskq, %1*16 + add tmp2q, %1*32 + add tmp1q, %1*32 +%endmacro + +cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-mask %+ SUFFIX %+ _table + lea r7, [mask %+ SUFFIX %+ _table] + tzcnt wd, wm + movifnidn hd, hm + mov maskq, maskmp + movsxd wq, dword [r7+wq*4] + vpbroadcastd m5, [base+pw_2048] + pxor m4, m4 + add wq, r7 + BIDIR_FN MASK + +%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4 + mova m%1, [tmp1q+32*%3] + mova m1, [tmp2q+32*%3] + psubw m1, m%1 + pabsw m%2, m1 + psubusw m%2, m6, m%2 + psrlw m%2, 8 ; 64 - m + psllw m2, m%2, 10 + pmulhw m1, m2 + paddw m%1, m1 + mova m1, [tmp1q+32*%4] + mova m2, [tmp2q+32*%4] + psubw m2, m1 + pabsw m3, m2 + psubusw m3, m6, m3 + psrlw m3, 8 +%if %5 + packuswb m%2, m3 + psubb m%2, m5, m%2 + vpermq m%2, m%2, q3120 +%else + phaddw m%2, m3 +%endif + psllw m3, 10 + pmulhw m2, m3 + paddw m1, m2 + pmulhrsw m%1, m7 + pmulhrsw m1, m7 + packuswb m%1, m1 +%endmacro + +cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask +%define base r6-blend_avx2_table + lea r6, [blend_avx2_table] + tzcnt wd, wm + movifnidn maskq, maskmp + movifnidn hd, hm + movsxd wq, dword [r6+wq*4] + vpbroadcastd m4, [base+pb_64] + vpbroadcastd m5, [base+pw_512] + sub tmpq, maskq + add wq, r6 + lea r6, [dsq*3] + jmp wq +.w4: + movd xm0, [dstq+dsq*0] + pinsrd xm0, [dstq+dsq*1], 1 + vpbroadcastd xm1, [dstq+dsq*2] + pinsrd xm1, [dstq+r6 ], 3 + mova xm6, [maskq] + psubb xm3, xm4, xm6 + punpcklbw xm2, xm3, xm6 + punpckhbw xm3, xm6 + mova xm6, [maskq+tmpq] + add maskq, 4*4 + punpcklbw xm0, xm6 + punpckhbw xm1, xm6 + pmaddubsw xm0, xm2 + pmaddubsw xm1, xm3 + pmulhrsw xm0, xm5 + pmulhrsw xm1, xm5 + packuswb xm0, xm1 + movd [dstq+dsq*0], xm0 + pextrd [dstq+dsq*1], xm0, 1 + pextrd [dstq+dsq*2], xm0, 2 + pextrd [dstq+r6 ], xm0, 3 + lea dstq, [dstq+dsq*4] + sub hd, 4 + jg .w4 + RET +ALIGN function_align +.w8: + movq xm1, [dstq+dsq*0] + movhps xm1, [dstq+dsq*1] + vpbroadcastq m2, [dstq+dsq*2] + vpbroadcastq m3, [dstq+r6 ] + mova m0, [maskq] + mova m6, [maskq+tmpq] + add maskq, 8*4 + vpblendd m1, m2, 0x30 + vpblendd m1, m3, 0xc0 + psubb m3, m4, m0 + punpcklbw m2, m3, m0 + punpckhbw m3, m0 + punpcklbw m0, m1, m6 + punpckhbw m1, m6 + pmaddubsw m0, m2 + pmaddubsw 
m1, m3 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 + vextracti128 xm1, m0, 1 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + movq [dstq+dsq*2], xm1 + movhps [dstq+r6 ], xm1 + lea dstq, [dstq+dsq*4] + sub hd, 4 + jg .w8 + RET +ALIGN function_align +.w16: + mova m0, [maskq] + mova xm1, [dstq+dsq*0] + vinserti128 m1, [dstq+dsq*1], 1 + psubb m3, m4, m0 + punpcklbw m2, m3, m0 + punpckhbw m3, m0 + mova m6, [maskq+tmpq] + add maskq, 16*2 + punpcklbw m0, m1, m6 + punpckhbw m1, m6 + pmaddubsw m0, m2 + pmaddubsw m1, m3 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w16 + RET +ALIGN function_align +.w32: + mova m0, [maskq] + mova m1, [dstq] + mova m6, [maskq+tmpq] + add maskq, 32 + psubb m3, m4, m0 + punpcklbw m2, m3, m0 + punpckhbw m3, m0 + punpcklbw m0, m1, m6 + punpckhbw m1, m6 + pmaddubsw m0, m2 + pmaddubsw m1, m3 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 + mova [dstq], m0 + add dstq, dsq + dec hd + jg .w32 + RET + +cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask +%define base r5-blend_v_avx2_table + lea r5, [blend_v_avx2_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, dword [r5+wq*4] + vpbroadcastd m5, [base+pw_512] + add wq, r5 + add maskq, obmc_masks-blend_v_avx2_table + jmp wq +.w2: + vpbroadcastd xm2, [maskq+2*2] +.w2_s0_loop: + movd xm0, [dstq+dsq*0] + pinsrw xm0, [dstq+dsq*1], 1 + movd xm1, [tmpq] + add tmpq, 2*2 + punpcklbw xm0, xm1 + pmaddubsw xm0, xm2 + pmulhrsw xm0, xm5 + packuswb xm0, xm0 + pextrw [dstq+dsq*0], xm0, 0 + pextrw [dstq+dsq*1], xm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w2_s0_loop + RET +ALIGN function_align +.w4: + vpbroadcastq xm2, [maskq+4*2] +.w4_loop: + movd xm0, [dstq+dsq*0] + pinsrd xm0, [dstq+dsq*1], 1 + movq xm1, [tmpq] + add tmpq, 4*2 + punpcklbw xm0, xm1 + pmaddubsw xm0, xm2 + pmulhrsw xm0, xm5 + packuswb xm0, xm0 + movd [dstq+dsq*0], xm0 + pextrd [dstq+dsq*1], xm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w4_loop + RET +ALIGN function_align +.w8: + mova xm3, [maskq+8*2] +.w8_loop: + movq xm0, [dstq+dsq*0] + vpbroadcastq xm1, [dstq+dsq*1] + mova xm2, [tmpq] + add tmpq, 8*2 + punpcklbw xm0, xm2 + punpckhbw xm1, xm2 + pmaddubsw xm0, xm3 + pmaddubsw xm1, xm3 + pmulhrsw xm0, xm5 + pmulhrsw xm1, xm5 + packuswb xm0, xm1 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w8_loop + RET +ALIGN function_align +.w16: + vbroadcasti128 m3, [maskq+16*2] + vbroadcasti128 m4, [maskq+16*3] +.w16_loop: + mova xm1, [dstq+dsq*0] + vinserti128 m1, [dstq+dsq*1], 1 + mova m2, [tmpq] + add tmpq, 16*2 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m4 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w16_loop + RET +ALIGN function_align +.w32: + mova xm3, [maskq+16*4] + vinserti128 m3, [maskq+16*6], 1 + mova xm4, [maskq+16*5] + vinserti128 m4, [maskq+16*7], 1 +.w32_loop: + mova m1, [dstq] + mova m2, [tmpq] + add tmpq, 32 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m4 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 + mova [dstq], m0 + add dstq, dsq + dec hd + jg .w32_loop + RET + +cglobal blend_h_8bpc, 4, 7, 6, dst, ds, tmp, w, h, mask +%define base r5-blend_h_avx2_table + lea r5, [blend_h_avx2_table] + mov r6d, wd + tzcnt wd, wd + mov hd, hm + movsxd wq, dword [r5+wq*4] + vpbroadcastd m5, [base+pw_512] + add 
wq, r5 + lea maskq, [base+obmc_masks+hq*2] + lea hd, [hq*3] + shr hd, 2 ; h * 3/4 + lea maskq, [maskq+hq*2] + neg hq + jmp wq +.w2: + movd xm0, [dstq+dsq*0] + pinsrw xm0, [dstq+dsq*1], 1 + movd xm2, [maskq+hq*2] + movd xm1, [tmpq] + add tmpq, 2*2 + punpcklwd xm2, xm2 + punpcklbw xm0, xm1 + pmaddubsw xm0, xm2 + pmulhrsw xm0, xm5 + packuswb xm0, xm0 + pextrw [dstq+dsq*0], xm0, 0 + pextrw [dstq+dsq*1], xm0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w2 + RET +ALIGN function_align +.w4: + mova xm3, [blend_shuf] +.w4_loop: + movd xm0, [dstq+dsq*0] + pinsrd xm0, [dstq+dsq*1], 1 + movd xm2, [maskq+hq*2] + movq xm1, [tmpq] + add tmpq, 4*2 + pshufb xm2, xm3 + punpcklbw xm0, xm1 + pmaddubsw xm0, xm2 + pmulhrsw xm0, xm5 + packuswb xm0, xm0 + movd [dstq+dsq*0], xm0 + pextrd [dstq+dsq*1], xm0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w4_loop + RET +ALIGN function_align +.w8: + vbroadcasti128 m4, [blend_shuf] + shufpd m4, m4, 0x03 +.w8_loop: + vpbroadcastq m1, [dstq+dsq*0] + movq xm0, [dstq+dsq*1] + vpblendd m0, m1, 0x30 + vpbroadcastd m3, [maskq+hq*2] + movq xm1, [tmpq+8*1] + vinserti128 m1, [tmpq+8*0], 1 + add tmpq, 8*2 + pshufb m3, m4 + punpcklbw m0, m1 + pmaddubsw m0, m3 + pmulhrsw m0, m5 + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + movhps [dstq+dsq*0], xm0 + movq [dstq+dsq*1], xm0 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w8_loop + RET +ALIGN function_align +.w16: + vbroadcasti128 m4, [blend_shuf] + shufpd m4, m4, 0x0c +.w16_loop: + mova xm1, [dstq+dsq*0] + vinserti128 m1, [dstq+dsq*1], 1 + vpbroadcastd m3, [maskq+hq*2] + mova m2, [tmpq] + add tmpq, 16*2 + pshufb m3, m4 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w16_loop + RET +ALIGN function_align +.w32: ; w32/w64/w128 + sub dsq, r6 +.w32_loop0: + vpbroadcastw m3, [maskq+hq*2] + mov wd, r6d +.w32_loop: + mova m1, [dstq] + mova m2, [tmpq] + add tmpq, 32 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 + mova [dstq], m0 + add dstq, 32 + sub wd, 32 + jg .w32_loop + add dstq, dsq + inc hq + jl .w32_loop0 + RET + +cglobal emu_edge_8bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \ + bottomext, rightext + ; we assume that the buffer (stride) is larger than width, so we can + ; safely overwrite by a few bytes + + ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + xor r12d, r12d + lea r10, [ihq-1] + cmp yq, ihq + cmovs r10, yq + test yq, yq + cmovs r10, r12 + imul r10, sstrideq + add srcq, r10 + + ; ref += iclip(x, 0, iw - 1) + lea r10, [iwq-1] + cmp xq, iwq + cmovs r10, xq + test xq, xq + cmovs r10, r12 + add srcq, r10 + + ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) + lea bottomextq, [yq+bhq] + sub bottomextq, ihq + lea r3, [bhq-1] + cmovs bottomextq, r12 + + DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \ + bottomext, rightext + + ; top_ext = iclip(-y, 0, bh - 1) + neg topextq + cmovs topextq, r12 + cmp bottomextq, bhq + cmovns bottomextq, r3 + cmp topextq, bhq + cmovg topextq, r3 + + ; right_ext = iclip(x + bw - iw, 0, bw - 1) + lea rightextq, [xq+bwq] + sub rightextq, iwq + lea r2, [bwq-1] + cmovs rightextq, r12 + + DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \ + bottomext, rightext + + ; left_ext = iclip(-x, 0, bw - 1) + neg leftextq + cmovs leftextq, r12 + cmp rightextq, bwq + cmovns rightextq, 
r2 + cmp leftextq, bwq + cmovns leftextq, r2 + + DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \ + dst, dstride, src, sstride, bottomext, rightext + + ; center_h = bh - top_ext - bottom_ext + lea r3, [bottomextq+topextq] + sub centerhq, r3 + + ; blk += top_ext * PXSTRIDE(dst_stride) + mov r2, topextq + imul r2, dstrideq + add dstq, r2 + mov r9m, dstq + + ; center_w = bw - left_ext - right_ext + mov centerwq, bwq + lea r3, [rightextq+leftextq] + sub centerwq, r3 + +%macro v_loop 3 ; need_left_ext, need_right_ext, suffix +.v_loop_%3: +%if %1 + ; left extension + xor r3, r3 + vpbroadcastb m0, [srcq] +.left_loop_%3: + mova [dstq+r3], m0 + add r3, 32 + cmp r3, leftextq + jl .left_loop_%3 + + ; body + lea r12, [dstq+leftextq] +%endif + xor r3, r3 +.body_loop_%3: + movu m0, [srcq+r3] +%if %1 + movu [r12+r3], m0 +%else + movu [dstq+r3], m0 +%endif + add r3, 32 + cmp r3, centerwq + jl .body_loop_%3 + +%if %2 + ; right extension +%if %1 + add r12, centerwq +%else + lea r12, [dstq+centerwq] +%endif + xor r3, r3 + vpbroadcastb m0, [srcq+centerwq-1] +.right_loop_%3: + movu [r12+r3], m0 + add r3, 32 + cmp r3, rightextq + jl .right_loop_%3 + +%endif + add dstq, dstrideq + add srcq, sstrideq + dec centerhq + jg .v_loop_%3 +%endmacro + + test leftextq, leftextq + jnz .need_left_ext + test rightextq, rightextq + jnz .need_right_ext + v_loop 0, 0, 0 + jmp .body_done + +.need_left_ext: + test rightextq, rightextq + jnz .need_left_right_ext + v_loop 1, 0, 1 + jmp .body_done + +.need_left_right_ext: + v_loop 1, 1, 2 + jmp .body_done + +.need_right_ext: + v_loop 0, 1, 3 + +.body_done: + ; bottom edge extension + test bottomextq, bottomextq + jz .top + mov srcq, dstq + sub srcq, dstrideq + xor r1, r1 +.bottom_x_loop: + mova m0, [srcq+r1] + lea r3, [dstq+r1] + mov r4, bottomextq +.bottom_y_loop: + mova [r3], m0 + add r3, dstrideq + dec r4 + jg .bottom_y_loop + add r1, 32 + cmp r1, bwq + jl .bottom_x_loop + +.top: + ; top edge extension + test topextq, topextq + jz .end + mov srcq, r9m + mov dstq, dstm + xor r1, r1 +.top_x_loop: + mova m0, [srcq+r1] + lea r3, [dstq+r1] + mov r4, topextq +.top_y_loop: + mova [r3], m0 + add r3, dstrideq + dec r4 + jg .top_y_loop + add r1, 32 + cmp r1, bwq + jl .top_x_loop + +.end: + RET + +cglobal resize_8bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \ + dst_w, h, src_w, dx, mx0 + sub dword mx0m, 4<<14 + sub dword src_wm, 8 + vpbroadcastd m5, dxm + vpbroadcastd m8, mx0m + vpbroadcastd m6, src_wm + + DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x + LEA r7, $$ +%define base r7-$$ + + vpbroadcastd xm3, [base+pw_m256] + vpbroadcastd m7, [base+pd_63] + vbroadcasti128 m15, [base+pb_8x0_8x8] + pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7] + pslld m5, 3 ; dx*8 + pslld m6, 14 + paddd m8, m2 ; mx+[0..7]*dx + pxor m2, m2 + + ; m2 = 0, m3 = pmulhrsw constant for x=(x+64)>>7 + ; m8 = mx+[0..7]*dx, m5 = dx*8, m6 = src_w, m7 = 0x3f, m15=0,8 + +.loop_y: + xor xd, xd + mova m4, m8 ; per-line working version of mx + +.loop_x: + pmaxsd m0, m4, m2 + psrad m9, m4, 8 ; filter offset (unmasked) + pminsd m0, m6 ; iclip(mx, 0, src_w-8) + psubd m1, m4, m0 ; pshufb offset + psrad m0, 14 ; clipped src_x offset + psrad m1, 14 ; pshufb edge_emu offset + pand m9, m7 ; filter offset (masked) + + ; load source pixels - this ugly code is vpgatherdq emulation since + ; directly using vpgatherdq on Haswell is quite a bit slower :( + movd r8d, xm0 + pextrd r9d, xm0, 1 + pextrd r10d, xm0, 2 + pextrd r11d, xm0, 3 + vextracti128 xm0, m0, 1 + movq xm12, [srcq+r8] + movq xm13, 
[srcq+r10] + movhps xm12, [srcq+r9] + movhps xm13, [srcq+r11] + movd r8d, xm0 + pextrd r9d, xm0, 1 + pextrd r10d, xm0, 2 + pextrd r11d, xm0, 3 + vinserti128 m12, [srcq+r8], 1 + vinserti128 m13, [srcq+r10], 1 + vpbroadcastq m10, [srcq+r9] + vpbroadcastq m11, [srcq+r11] + vpblendd m12, m10, 11000000b + vpblendd m13, m11, 11000000b + + ; if no emulation is required, we don't need to shuffle or emulate edges + ; this also saves 2 quasi-vpgatherdqs + vptest m1, m1 + jz .filter + + movq r9, xm1 + pextrq r11, xm1, 1 + movsxd r8, r9d + sar r9, 32 + movsxd r10, r11d + sar r11, 32 + vextracti128 xm1, m1, 1 + movq xm14, [base+resize_shuf+4+r8] + movq xm0, [base+resize_shuf+4+r10] + movhps xm14, [base+resize_shuf+4+r9] + movhps xm0, [base+resize_shuf+4+r11] + movq r9, xm1 + pextrq r11, xm1, 1 + movsxd r8, r9d + sar r9, 32 + movsxd r10, r11d + sar r11, 32 + vinserti128 m14, [base+resize_shuf+4+r8], 1 + vinserti128 m0, [base+resize_shuf+4+r10], 1 + vpbroadcastq m10, [base+resize_shuf+4+r9] + vpbroadcastq m11, [base+resize_shuf+4+r11] + vpblendd m14, m10, 11000000b + vpblendd m0, m11, 11000000b + + paddb m14, m15 + paddb m0, m15 + pshufb m12, m14 + pshufb m13, m0 + +.filter: + movd r8d, xm9 + pextrd r9d, xm9, 1 + pextrd r10d, xm9, 2 + pextrd r11d, xm9, 3 + vextracti128 xm9, m9, 1 + movq xm10, [base+resize_filter+r8*8] + movq xm11, [base+resize_filter+r10*8] + movhps xm10, [base+resize_filter+r9*8] + movhps xm11, [base+resize_filter+r11*8] + movd r8d, xm9 + pextrd r9d, xm9, 1 + pextrd r10d, xm9, 2 + pextrd r11d, xm9, 3 + vinserti128 m10, [base+resize_filter+r8*8], 1 + vinserti128 m11, [base+resize_filter+r10*8], 1 + vpbroadcastq m14, [base+resize_filter+r9*8] + vpbroadcastq m1, [base+resize_filter+r11*8] + vpblendd m10, m14, 11000000b + vpblendd m11, m1, 11000000b + + pmaddubsw m12, m10 + pmaddubsw m13, m11 + phaddw m12, m13 + vextracti128 xm13, m12, 1 + phaddsw xm12, xm13 + pmulhrsw xm12, xm3 ; x=(x+64)>>7 + packuswb xm12, xm12 + movq [dstq+xq], xm12 + + paddd m4, m5 + add xd, 8 + cmp xd, dst_wd + jl .loop_x + + add dstq, dst_strideq + add srcq, src_strideq + dec hd + jg .loop_y + RET + +cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_420_avx2_table + lea r7, [w_mask_420_avx2_table] + tzcnt wd, wm + mov r6d, r7m ; sign + movifnidn hd, hm + movsxd wq, [r7+wq*4] + vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 + vpbroadcastd m7, [base+pw_2048] + pmovzxbd m9, [base+deint_shuf4] + vpbroadcastd m8, [base+wm_420_sign+r6*4] ; 258 - sign + add wq, r7 + W_MASK 0, 4, 0, 1 + mov maskq, maskmp + lea stride3q, [strideq*3] + jmp wq +.w4: + vextracti128 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 + cmp hd, 8 + jl .w4_end + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 + jg .w4_h16 +.w4_end: + vextracti128 xm0, m4, 1 + vpblendd xm1, xm4, xm0, 0x05 + vpblendd xm4, xm0, 0x0a + pshufd xm1, xm1, q2301 + psubw xm4, xm8, xm4 + psubw xm4, xm1 + psrlw xm4, 2 + packuswb xm4, xm4 + movq [maskq], xm4 + RET +.w4_h16: + W_MASK 0, 5, 2, 3 + lea dstq, [dstq+strideq*4] + phaddd m4, m5 + vextracti128 xm1, m0, 1 + psubw m4, m8, m4 + psrlw m4, 2 + vpermd m4, m9, m4 + vextracti128 xm5, m4, 1 + packuswb xm4, xm5 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q], xm1, 1 + lea dstq, [dstq+strideq*4] + 
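; rows 12-15 come from the high dwords of xm0/xm1; xm4 now holds the finished 2x2-subsampled 4:2:0 mask for the whole 4x16 block +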
pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 + mova [maskq], xm4 + RET +.w8_loop: + add tmp1q, 2*32 + add tmp2q, 2*32 + W_MASK 0, 4, 0, 1 + lea dstq, [dstq+strideq*4] + add maskq, 8 +.w8: + vextracti128 xm2, m4, 1 + vextracti128 xm1, m0, 1 + psubw xm4, xm8, xm4 + psubw xm4, xm2 + psrlw xm4, 2 + packuswb xm4, xm4 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm1 + movq [maskq], xm4 + sub hd, 4 + jg .w8_loop + RET +.w16_loop: + add tmp1q, 4*32 + add tmp2q, 4*32 + W_MASK 0, 4, 0, 1 + lea dstq, [dstq+strideq*4] + add maskq, 16 +.w16: + vpermq m0, m0, q3120 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + W_MASK 0, 5, 2, 3 + punpckhqdq m1, m4, m5 + punpcklqdq m4, m5 + psubw m1, m8, m1 + psubw m1, m4 + psrlw m1, 2 + vpermq m0, m0, q3120 + packuswb m1, m1 + vpermd m1, m9, m1 + mova [dstq+strideq*2], xm0 + vextracti128 [dstq+stride3q ], m0, 1 + mova [maskq], xm1 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + add tmp1q, 4*32 + add tmp2q, 4*32 + W_MASK 0, 4, 0, 1 + lea dstq, [dstq+strideq*2] + add maskq, 16 +.w32: + vpermq m0, m0, q3120 + mova [dstq+strideq*0], m0 + W_MASK 0, 5, 2, 3 + psubw m4, m8, m4 + psubw m4, m5 + psrlw m4, 2 + vpermq m0, m0, q3120 + packuswb m4, m4 + vpermd m4, m9, m4 + mova [dstq+strideq*1], m0 + mova [maskq], xm4 + sub hd, 2 + jg .w32_loop + RET +.w64_loop_even: + psubw m10, m8, m4 + psubw m11, m8, m5 + dec hd +.w64_loop: + add tmp1q, 4*32 + add tmp2q, 4*32 + W_MASK 0, 4, 0, 1 + add dstq, strideq +.w64: + vpermq m0, m0, q3120 + mova [dstq+32*0], m0 + W_MASK 0, 5, 2, 3 + vpermq m0, m0, q3120 + mova [dstq+32*1], m0 + test hd, 1 + jz .w64_loop_even + psubw m4, m10, m4 + psubw m5, m11, m5 + psrlw m4, 2 + psrlw m5, 2 + packuswb m4, m5 + vpermd m4, m9, m4 + mova [maskq], m4 + add maskq, 32 + dec hd + jg .w64_loop + RET +.w128_loop_even: + psubw m12, m8, m4 + psubw m13, m8, m5 + dec hd +.w128_loop: + W_MASK 0, 4, 0, 1 + add dstq, strideq +.w128: + vpermq m0, m0, q3120 + mova [dstq+32*0], m0 + W_MASK 0, 5, 2, 3 + vpermq m0, m0, q3120 + mova [dstq+32*1], m0 + add tmp1q, 8*32 + add tmp2q, 8*32 + test hd, 1 + jz .w128_even + psubw m4, m10, m4 + psubw m5, m11, m5 + psrlw m4, 2 + psrlw m5, 2 + packuswb m4, m5 + vpermd m4, m9, m4 + mova [maskq+32*0], m4 + jmp .w128_odd +.w128_even: + psubw m10, m8, m4 + psubw m11, m8, m5 +.w128_odd: + W_MASK 0, 4, -4, -3 + vpermq m0, m0, q3120 + mova [dstq+32*2], m0 + W_MASK 0, 5, -2, -1 + vpermq m0, m0, q3120 + mova [dstq+32*3], m0 + test hd, 1 + jz .w128_loop_even + psubw m4, m12, m4 + psubw m5, m13, m5 + psrlw m4, 2 + psrlw m5, 2 + packuswb m4, m5 + vpermd m4, m9, m4 + mova [maskq+32*1], m4 + add maskq, 64 + dec hd + jg .w128_loop + RET + +cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_422_avx2_table + lea r7, [w_mask_422_avx2_table] + tzcnt wd, wm + mov r6d, r7m ; sign + movifnidn hd, hm + pxor m9, m9 + movsxd wq, dword [r7+wq*4] + vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 + vpbroadcastd m7, [base+pw_2048] + pmovzxbd m10, [base+deint_shuf4] + vpbroadcastd m8, [base+wm_422_sign+r6*4] ; 128 - sign + add wq, r7 + mov maskq, maskmp + W_MASK 0, 4, 0, 1 + lea stride3q, [strideq*3] + jmp wq +.w4: + vextracti128 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 + cmp hd, 8 + jl .w4_end + lea dstq, [dstq+strideq*4] + pextrd 
[dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 + jg .w4_h16 +.w4_end: + vextracti128 xm5, m4, 1 + packuswb xm4, xm5 + psubb xm5, xm8, xm4 + pavgb xm5, xm9 + pshufd xm5, xm5, q3120 + mova [maskq], xm5 + RET +.w4_h16: + W_MASK 0, 5, 2, 3 + lea dstq, [dstq+strideq*4] + packuswb m4, m5 + psubb m5, m8, m4 + pavgb m5, m9 + vpermd m5, m10, m5 + vextracti128 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 + mova [maskq], m5 + RET +.w8_loop: + add tmp1q, 32*2 + add tmp2q, 32*2 + W_MASK 0, 4, 0, 1 + lea dstq, [dstq+strideq*4] + add maskq, 16 +.w8: + vextracti128 xm5, m4, 1 + vextracti128 xm1, m0, 1 + packuswb xm4, xm5 + psubb xm5, xm8, xm4 + pavgb xm5, xm9 + pshufd xm5, xm5, q3120 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm1 + mova [maskq], xm5 + sub hd, 4 + jg .w8_loop + RET +.w16_loop: + add tmp1q, 32*4 + add tmp2q, 32*4 + W_MASK 0, 4, 0, 1 + lea dstq, [dstq+strideq*4] + add maskq, 32 +.w16: + vpermq m0, m0, q3120 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + W_MASK 0, 5, 2, 3 + packuswb m4, m5 + psubb m5, m8, m4 + pavgb m5, m9 + vpermq m0, m0, q3120 + vpermd m5, m10, m5 + mova [dstq+strideq*2], xm0 + vextracti128 [dstq+stride3q ], m0, 1 + mova [maskq], m5 + sub hd, 4 + jg .w16_loop + RET +.w32_loop: + add tmp1q, 32*4 + add tmp2q, 32*4 + W_MASK 0, 4, 0, 1 + lea dstq, [dstq+strideq*2] + add maskq, 32 +.w32: + vpermq m0, m0, q3120 + mova [dstq+strideq*0], m0 + W_MASK 0, 5, 2, 3 + packuswb m4, m5 + psubb m5, m8, m4 + pavgb m5, m9 + vpermq m0, m0, q3120 + vpermd m5, m10, m5 + mova [dstq+strideq*1], m0 + mova [maskq], m5 + sub hd, 2 + jg .w32_loop + RET +.w64_loop: + add tmp1q, 32*4 + add tmp2q, 32*4 + W_MASK 0, 4, 0, 1 + add dstq, strideq + add maskq, 32 +.w64: + vpermq m0, m0, q3120 + mova [dstq+32*0], m0 + W_MASK 0, 5, 2, 3 + packuswb m4, m5 + psubb m5, m8, m4 + pavgb m5, m9 + vpermq m0, m0, q3120 + vpermd m5, m10, m5 + mova [dstq+32*1], m0 + mova [maskq], m5 + dec hd + jg .w64_loop + RET +.w128_loop: + add tmp1q, 32*8 + add tmp2q, 32*8 + W_MASK 0, 4, 0, 1 + add dstq, strideq + add maskq, 32*2 +.w128: + vpermq m0, m0, q3120 + mova [dstq+32*0], m0 + W_MASK 0, 5, 2, 3 + packuswb m4, m5 + psubb m5, m8, m4 + pavgb m5, m9 + vpermq m0, m0, q3120 + vpermd m5, m10, m5 + mova [dstq+32*1], m0 + mova [maskq+32*0], m5 + W_MASK 0, 4, 4, 5 + vpermq m0, m0, q3120 + mova [dstq+32*2], m0 + W_MASK 0, 5, 6, 7 + packuswb m4, m5 + psubb m5, m8, m4 + pavgb m5, m9 + vpermq m0, m0, q3120 + vpermd m5, m10, m5 + mova [dstq+32*3], m0 + mova [maskq+32*1], m5 + dec hd + jg .w128_loop + RET + +cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_444_avx2_table + lea r7, [w_mask_444_avx2_table] + tzcnt wd, wm + movifnidn hd, hm + mov maskq, maskmp + movsxd wq, dword [r7+wq*4] + vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 + vpbroadcastd m5, [base+pb_64] + vpbroadcastd m7, [base+pw_2048] + add wq, r7 + W_MASK 0, 4, 0, 1, 1 + lea stride3q, [strideq*3] + jmp wq +.w4: + vextracti128 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 + mova [maskq+32*0], m4 + 
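; 4:4:4 keeps the mask at full luma resolution, so the mask bytes produced by W_MASK are stored as-is with no subsampling pass +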
cmp hd, 8 + jl .w4_end + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 + je .w4_end + W_MASK 0, 4, 2, 3, 1 + lea dstq, [dstq+strideq*4] + vextracti128 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 + mova [maskq+32*1], m4 +.w4_end: + RET +.w8_loop: + add tmp1q, 32*2 + add tmp2q, 32*2 + W_MASK 0, 4, 0, 1, 1 + lea dstq, [dstq+strideq*4] + add maskq, 32 +.w8: + vextracti128 xm1, m0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm1 + mova [maskq], m4 + sub hd, 4 + jg .w8_loop + RET +.w16_loop: + add tmp1q, 32*2 + add tmp2q, 32*2 + W_MASK 0, 4, 0, 1, 1 + lea dstq, [dstq+strideq*2] + add maskq, 32 +.w16: + vpermq m0, m0, q3120 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [maskq], m4 + sub hd, 2 + jg .w16_loop + RET +.w32_loop: + add tmp1q, 32*2 + add tmp2q, 32*2 + W_MASK 0, 4, 0, 1, 1 + add dstq, strideq + add maskq, 32 +.w32: + vpermq m0, m0, q3120 + mova [dstq], m0 + mova [maskq], m4 + dec hd + jg .w32_loop + RET +.w64_loop: + add tmp1q, 32*4 + add tmp2q, 32*4 + W_MASK 0, 4, 0, 1, 1 + add dstq, strideq + add maskq, 32*2 +.w64: + vpermq m0, m0, q3120 + mova [dstq+32*0], m0 + mova [maskq+32*0], m4 + W_MASK 0, 4, 2, 3, 1 + vpermq m0, m0, q3120 + mova [dstq+32*1], m0 + mova [maskq+32*1], m4 + dec hd + jg .w64_loop + RET +.w128_loop: + add tmp1q, 32*8 + add tmp2q, 32*8 + W_MASK 0, 4, 0, 1, 1 + add dstq, strideq + add maskq, 32*4 +.w128: + vpermq m0, m0, q3120 + mova [dstq+32*0], m0 + mova [maskq+32*0], m4 + W_MASK 0, 4, 2, 3, 1 + vpermq m0, m0, q3120 + mova [dstq+32*1], m0 + mova [maskq+32*1], m4 + W_MASK 0, 4, 4, 5, 1 + vpermq m0, m0, q3120 + mova [dstq+32*2], m0 + mova [maskq+32*2], m4 + W_MASK 0, 4, 6, 7, 1 + vpermq m0, m0, q3120 + mova [dstq+32*3], m0 + mova [maskq+32*3], m4 + dec hd + jg .w128_loop + RET + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/mc_avx512.asm b/third_party/dav1d/src/x86/mc_avx512.asm new file mode 100644 index 0000000000..7897f1decc --- /dev/null +++ b/third_party/dav1d/src/x86/mc_avx512.asm @@ -0,0 +1,4538 @@ +; Copyright © 2020, VideoLAN and dav1d authors +; Copyright © 2020, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 + +obmc_masks: +pw_512: times 2 dw 512 + ; 2 + db 45, 19, 64, 0 + ; 4 + db 39, 25, 50, 14, 59, 5, 64, 0 + ; 8 + db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0 + ; 16 + db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10 + db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0 + ; 32 + db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20 + db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9 + db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2 + db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0 + +warp_8x8_permA: db 4, 5, 6, 7, 16, 17, 18, 19, 5, 6, 7, 8, 17, 18, 19, 20 + db 6, 7, 8, 9, 18, 19, 20, 21, 7, 8, 9, 10, 19, 20, 21, 22 + db 8, 9, 10, 11, 20, 21, 22, 23, 9, 10, 11, 12, 21, 22, 23, 24 + db 10, 11, 12, 13, 22, 23, 24, 25, 11, 12, 13, 14, 23, 24, 25, 26 +warp_8x8_permB: db 0, 1, 2, 3, 20, 21, 22, 23, 1, 2, 3, 4, 21, 22, 23, 24 + db 2, 3, 4, 5, 22, 23, 24, 25, 3, 4, 5, 6, 23, 24, 25, 26 + db 4, 5, 6, 7, 24, 25, 26, 27, 5, 6, 7, 8, 25, 26, 27, 28 + db 6, 7, 8, 9, 26, 27, 28, 29, 7, 8, 9, 10, 27, 28, 29, 30 +warp_8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13 +warp_8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15 +pd_0to7: dd 0, 1, 2, 3, 4, 5, 6, 7 +warp_8x8_hpack: db 3, 11, 3, 11, 35, 43, 35, 43 +pd_16384: dd 16384 +pd_262144: dd 262144 +warp_8x8_end: db 0, 4, 16, 20, 32, 36, 48, 52, 2, 6, 18, 22, 34, 38, 50, 54 +warp_8x8t_end: db 2, 3, 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58, 59 + db 6, 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 63 +bidir_sctr_w4: dd 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +wm_420_perm4: db 1, 3, 9, 11, 5, 7, 13, 15, 17, 19, 25, 27, 21, 23, 29, 31 + db 33, 35, 41, 43, 37, 39, 45, 47, 49, 51, 57, 59, 53, 55, 61, 63 + db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30 + db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62 +wm_420_perm8: db 1, 3, 17, 19, 5, 7, 21, 23, 9, 11, 25, 27, 13, 15, 29, 31 + db 33, 35, 49, 51, 37, 39, 53, 55, 41, 43, 57, 59, 45, 47, 61, 63 + db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30 + db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62 +wm_420_perm16: db 1, 3, 33, 35, 5, 7, 37, 39, 9, 11, 41, 43, 13, 15, 45, 47 + db 17, 19, 49, 51, 21, 23, 53, 55, 25, 27, 57, 59, 29, 31, 61, 63 + db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46 + db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62 +wm_420_mask: db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63 + db 67, 71, 75, 79, 83, 87, 91, 95, 99,103,107,111,115,119,123,127 + db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 + db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125 +wm_422_mask: db 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62 + db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 + 
db 66, 70, 74, 78, 82, 86, 90, 94, 98,102,106,110,114,118,122,126 + db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125 +wm_444_mask: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63 + db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62 +bilin_h_perm16: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 + db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15 + db 33, 32, 34, 33, 35, 34, 36, 35, 37, 36, 38, 37, 39, 38, 40, 39 + db 41, 40, 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47 +bilin_h_perm32: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 + db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15 + db 17, 16, 18, 17, 19, 18, 20, 19, 21, 20, 22, 21, 23, 22, 24, 23 + db 25, 24, 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31 +bilin_v_perm8: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 + db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23 + db 32, 80, 33, 81, 34, 82, 35, 83, 36, 84, 37, 85, 38, 86, 39, 87 + db 64, 32, 65, 33, 66, 34, 67, 35, 68, 36, 69, 37, 70, 38, 71, 39 +bilin_v_perm16: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 + db 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 + db 64, 16, 65, 17, 66, 18, 67, 19, 68, 20, 69, 21, 70, 22, 71, 23 + db 72, 24, 73, 25, 74, 26, 75, 27, 76, 28, 77, 29, 78, 30, 79, 31 +bilin_v_perm32: db 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7 + db 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15 + db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23 + db 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31 +bilin_v_perm64: dq 0, 4, 1, 5, 2, 6, 3, 7 +spel_h_perm16a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 + db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 + db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38 + db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46 +spel_h_perm16b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 + db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 + db 36, 37, 38, 39, 37, 38, 39, 40, 38, 39, 40, 41, 39, 40, 41, 42 + db 44, 45, 46, 47, 45, 46, 47, 48, 46, 47, 48, 49, 47, 48, 49, 50 +spel_h_perm16c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 + db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 + db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46 + db 48, 49, 50, 51, 49, 50, 51, 52, 50, 51, 52, 53, 51, 52, 53, 54 +spel_h_perm32a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 + db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 + db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 + db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30 +spel_h_perm32b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 + db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 + db 20, 21, 22, 23, 21, 22, 23, 24, 22, 23, 24, 25, 23, 24, 25, 26 + db 28, 29, 30, 31, 29, 30, 31, 32, 30, 31, 32, 33, 31, 32, 33, 34 +spel_h_perm32c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 + db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 + db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30 + db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38 +spel_v_perm16: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7 + db 0, 8, 1, 
9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 + db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23 + db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 +spel_v_perm32: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39 + db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 + db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55 + db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 +spel_hv_perm4a: db 8, 9, 16, 17, 10, 11, 18, 19, 12, 13, 20, 21, 14, 15, 22, 23 + db 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31 +spel_hv_perm4b: db 24, 25, 32, 33, 26, 27, 34, 35, 28, 29, 36, 37, 30, 31, 38, 39 + db 32, 33, 40, 41, 34, 35, 42, 43, 36, 37, 44, 45, 38, 39, 46, 47 +spel_hv_perm4c: db 40, 41, 48, 49, 42, 43, 50, 51, 44, 45, 52, 53, 46, 47, 54, 55 + db 48, 49, 56, 57, 50, 51, 58, 59, 52, 53, 60, 61, 54, 55, 62, 63 +spel_hv_perm4d: db 18, 19, 0, 1, 22, 23, 4, 5, 26, 27, 8, 9, 30, 31, 12, 13 + db 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29 +spel_hv_perm8a: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 + db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 + db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39 + db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47 +spel_hv_perm8b: db 32, 33, 48, 49, 34, 35, 50, 51, 36, 37, 52, 53, 38, 39, 54, 55 + db 40, 41, 56, 57, 42, 43, 58, 59, 44, 45, 60, 61, 46, 47, 62, 63 + db 48, 49, 64, 65, 50, 51, 66, 67, 52, 53, 68, 69, 54, 55, 70, 71 + db 56, 57, 72, 73, 58, 59, 74, 75, 60, 61, 76, 77, 62, 63, 78, 79 +spel_hv_perm8c: db 34, 35, 0, 1, 38, 39, 4, 5, 42, 43, 8, 9, 46, 47, 12, 13 + db 50, 51, 16, 17, 54, 55, 20, 21, 58, 59, 24, 25, 62, 63, 28, 29 + db 0, 1, 32, 33, 4, 5, 36, 37, 8, 9, 40, 41, 12, 13, 44, 45 + db 16, 17, 48, 49, 20, 21, 52, 53, 24, 25, 56, 57, 28, 29, 60, 61 +spel_hv_end16: db 1, 3, 17, 19, 5, 7, 21, 23, 33, 35, 49, 51, 37, 39, 53, 55 + db 9, 11, 25, 27, 13, 15, 29, 31, 41, 43, 57, 59, 45, 47, 61, 63 +spel_hv_perm16a:db 0, 1, 2, 3, 32, 33, 34, 35, 1, 2, 3, 4, 33, 34, 35, 36 + db 2, 3, 4, 5, 34, 35, 36, 37, 3, 4, 5, 6, 35, 36, 37, 38 +spel_hv_perm16c:db 8, 9, 10, 11, 40, 41, 42, 43, 9, 10, 11, 12, 41, 42, 43, 44 + db 10, 11, 12, 13, 42, 43, 44, 45, 11, 12, 13, 14, 43, 44, 45, 46 + db 16, 17, 18, 19, 48, 49, 50, 51, 17, 18, 19, 20, 49, 50, 51, 52 + db 18, 19, 20, 21, 50, 51, 52, 53, 19, 20, 21, 22, 51, 52, 53, 54 +spel_hv_perm16b:db 4, 5, 6, 7, 36, 37, 38, 39, 5, 6, 7, 8, 37, 38, 39, 40 + db 6, 7, 8, 9, 38, 39, 40, 41, 7, 8, 9, 10, 39, 40, 41, 42 + db 12, 13, 14, 15, 44, 45, 46, 47, 13, 14, 15, 16, 45, 46, 47, 48 + db 14, 15, 16, 17, 46, 47, 48, 49, 15, 16, 17, 18, 47, 48, 49, 50 +spel_hv_perm16d:db 0, 1, 2, 3, 1, 2, 3, 4, 4, 5, 6, 7, 5, 6, 7, 8 + db 2, 3, 4, 5, 3, 4, 5, 6, 6, 7, 8, 9, 7, 8, 9, 10 + db 8, 9, 10, 11, 9, 10, 11, 12, 12, 13, 14, 15, 13, 14, 15, 16 + db 10, 11, 12, 13, 11, 12, 13, 14, 14, 15, 16, 17, 15, 16, 17, 18 +spel_hv_perm16e:db 4, 5, 6, 7, 5, 6, 7, 8, 8, 9, 10, 11, 9, 10, 11, 12 + db 6, 7, 8, 9, 7, 8, 9, 10, 10, 11, 12, 13, 11, 12, 13, 14 + db 12, 13, 14, 15, 13, 14, 15, 16, 16, 17, 18, 19, 17, 18, 19, 20 + db 14, 15, 16, 17, 15, 16, 17, 18, 18, 19, 20, 21, 19, 20, 21, 22 +spel_hv_end: db 1, 3, 5, 7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55 +deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 +subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 + db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14 +subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 
3, 4, 2, 3, 4, 5, 3, 4, 5, 6 +subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 +subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11 +bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 +bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7 +blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 +rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7 +resize_permA: dd 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 +resize_permB: dd 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +resize_permC: dd 0, 4, 8, 12 +pb_02461357: db 0, 2, 4, 6, 1, 3, 5, 7 + +wm_420_perm64: dq 0xfedcba9876543210 +wm_sign: dd 0x40804080, 0xc0c0c0c0, 0x40404040 + +pb_8x0_8x8: times 8 db 0 + times 8 db 8 +pb_127: times 4 db 127 +pw_m128 times 2 dw -128 +pw_m256: times 2 dw -256 +pw_1024: times 2 dw 1024 +pw_2048: times 2 dw 2048 +pw_6903: times 2 dw 6903 +pw_8192: times 2 dw 8192 +pd_32: dd 32 +pd_34: dd 34 +pd_63: dd 63 +pd_512: dd 512 +pd_32768: dd 32768 + +%define pb_m64 (wm_sign+4) +%define pb_64 (wm_sign+8) +%define pd_2 (pd_0to7+8) + +cextern mc_subpel_filters +%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) +cextern mc_warp_filter +cextern resize_filter + +%macro BASE_JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - %3) + %xdefine %%base %1_%2 + %%table: + %rep %0 - 2 + dw %%base %+ _w%3 - %%base + %rotate 1 + %endrep +%endmacro + +%macro HV_JMP_TABLE 5-* + %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3) + %xdefine %%base %1_%3 + %assign %%types %4 + %if %%types & 1 + %xdefine %1_%2_h_%3_table (%%h - %5) + %%h: + %rep %0 - 4 + dw %%prefix %+ .h_w%5 - %%base + %rotate 1 + %endrep + %rotate 4 + %endif + %if %%types & 2 + %xdefine %1_%2_v_%3_table (%%v - %5) + %%v: + %rep %0 - 4 + dw %%prefix %+ .v_w%5 - %%base + %rotate 1 + %endrep + %rotate 4 + %endif + %if %%types & 4 + %xdefine %1_%2_hv_%3_table (%%hv - %5) + %%hv: + %rep %0 - 4 + dw %%prefix %+ .hv_w%5 - %%base + %rotate 1 + %endrep + %endif +%endmacro + +%macro BIDIR_JMP_TABLE 2-* + %xdefine %1_%2_table (%%table - 2*%3) + %xdefine %%base %1_%2_table + %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) + %%table: + %rep %0 - 2 + dd %%prefix %+ .w%3 - %%base + %rotate 1 + %endrep +%endmacro + +%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_8bpc_avx512icl.put) +%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_8bpc_avx512icl.prep) + +%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX + +BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128 +BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, 8tap, avx512icl, 3, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, 8tap, avx512icl, 7, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE blend, avx512icl, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_v, avx512icl, 2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_h, avx512icl, 2, 
4, 8, 16, 32, 64, 128 + +SECTION .text + +%macro WRAP_YMM 1+ +INIT_YMM cpuname + %1 +INIT_ZMM cpuname +%endmacro + +INIT_ZMM avx512icl +cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy + movifnidn mxyd, r6m ; mx + lea r7, [put_avx512icl] + tzcnt wd, wm + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r7m ; my + test mxyd, mxyd + jnz .v +.put: + movzx wd, word [r7+wq*2+table_offset(put,)] + add wq, r7 + jmp wq +.put_w2: + movzx r6d, word [srcq+ssq*0] + movzx r7d, word [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r6w + mov [dstq+dsq*1], r7w + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w2 + RET +.put_w4: + mov r6d, [srcq+ssq*0] + mov r7d, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r6d + mov [dstq+dsq*1], r7d + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w4 + RET +.put_w8: + mov r6, [srcq+ssq*0] + mov r7, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r6 + mov [dstq+dsq*1], r7 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w8 + RET +.put_w16: + movu xmm0, [srcq+ssq*0] + movu xmm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], xmm0 + mova [dstq+dsq*1], xmm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w16 + RET +.put_w32: + movu ym0, [srcq+ssq*0] + movu ym1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], ym0 + mova [dstq+dsq*1], ym1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w32 + RET +.put_w64: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w64 + RET +.put_w128: + movu m0, [srcq+ssq*0+64*0] + movu m1, [srcq+ssq*0+64*1] + movu m2, [srcq+ssq*1+64*0] + movu m3, [srcq+ssq*1+64*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0+64*0], m0 + mova [dstq+dsq*0+64*1], m1 + mova [dstq+dsq*1+64*0], m2 + mova [dstq+dsq*1+64*1], m3 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w128 + RET +.h: + ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 + ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 + imul mxyd, 0xff01 + vbroadcasti128 m4, [bilin_h_shuf8] + add mxyd, 16 << 8 + vpbroadcastw m5, mxyd + mov mxyd, r7m ; my + test mxyd, mxyd + jnz .hv + movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)] + vpbroadcastd m3, [pw_2048] + add wq, r7 + jmp wq +.h_w2: + movd xmm0, [srcq+ssq*0] + pinsrd xmm0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pshufb xmm0, xm4 + pmaddubsw xmm0, xm5 + pmulhrsw xmm0, xm3 + packuswb xmm0, xmm0 + pextrw [dstq+dsq*0], xmm0, 0 + pextrw [dstq+dsq*1], xmm0, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2 + RET +.h_w4: + mova xmm4, [bilin_h_shuf4] +.h_w4_loop: + movq xmm0, [srcq+ssq*0] + movhps xmm0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xmm0, xmm4 + pmaddubsw xmm0, xm5 + pmulhrsw xmm0, xm3 + packuswb xmm0, xmm0 + movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4_loop + RET +.h_w8: + movu xm0, [srcq+ssq*0] + vinserti32x4 ym0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pshufb ym0, ym4 + pmaddubsw ym0, ym5 + pmulhrsw ym0, ym3 + vpmovuswb xm0, ym0 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + mova m4, [bilin_h_perm16] +.h_w16_loop: + movu ym0, [srcq+ssq*0] + vinserti32x8 m0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + vpermb m0, m4, m0 + pmaddubsw m0, m5 + pmulhrsw m0, m3 + vpmovuswb ym0, m0 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], ym0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w16_loop + RET +.h_w32: + movu ym0, 
[srcq+ssq*0+8*0] + vinserti32x8 m0, [srcq+ssq*1+8*0], 1 + movu ym1, [srcq+ssq*0+8*1] + vinserti32x8 m1, [srcq+ssq*1+8*1], 1 + lea srcq, [srcq+ssq*2] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w32 + RET +.h_w64: + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + add srcq, ssq + mova [dstq], m0 + add dstq, dsq + dec hd + jg .h_w64 + RET +.h_w128: + movu m0, [srcq+8*0] + movu m2, [srcq+8*1] + movu m1, [srcq+8*8] + movu m6, [srcq+8*9] + add srcq, ssq + REPX {pshufb x, m4}, m0, m2, m1, m6 + REPX {pmaddubsw x, m5}, m0, m2, m1, m6 + REPX {pmulhrsw x, m3}, m0, m2, m1, m6 + packuswb m0, m2 + packuswb m1, m6 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + add dstq, dsq + dec hd + jg .h_w128 + RET +.v: + movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)] + imul mxyd, 0xff01 + vpbroadcastd m5, [pw_2048] + add mxyd, 16 << 8 + add wq, r7 + vpbroadcastw m4, mxyd + jmp wq +.v_w2: + movd xmm0, [srcq+ssq*0] +.v_w2_loop: + pinsrw xmm1, xmm0, [srcq+ssq*1], 1 ; 0 1 + lea srcq, [srcq+ssq*2] + pinsrw xmm0, xmm1, [srcq+ssq*0], 0 ; 2 1 + pshuflw xmm1, xmm1, q2301 ; 1 0 + punpcklbw xmm1, xmm0, xmm1 + pmaddubsw xmm1, xm4 + pmulhrsw xmm1, xm5 + packuswb xmm1, xmm1 + pextrw [dstq+dsq*0], xmm1, 1 + pextrw [dstq+dsq*1], xmm1, 0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movd xmm0, [srcq+ssq*0] +.v_w4_loop: + vpbroadcastd xmm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd xmm2, xmm1, xmm0, 0x01 ; 0 1 + vpbroadcastd xmm0, [srcq+ssq*0] + vpblendd xmm1, xmm0, 0x02 ; 1 2 + punpcklbw xmm1, xmm2 + pmaddubsw xmm1, xm4 + pmulhrsw xmm1, xm5 + packuswb xmm1, xmm1 + movd [dstq+dsq*0], xmm1 + pextrd [dstq+dsq*1], xmm1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + movq xmm0, [srcq+ssq*0] +.v_w8_loop: + movq xmm3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklbw xmm1, xmm3, xmm0 + movq xmm0, [srcq+ssq*0] + punpcklbw xmm2, xmm0, xmm3 + pmaddubsw xmm1, xm4 + pmaddubsw xmm2, xm4 + pmulhrsw xmm1, xm5 + pmulhrsw xmm2, xm5 + packuswb xmm1, xmm2 + movq [dstq+dsq*0], xmm1 + movhps [dstq+dsq*1], xmm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + RET +.v_w16: + movu xmm0, [srcq+ssq*0] +.v_w16_loop: + vbroadcasti128 ymm2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd ymm3, ymm2, ymm0, 0x0f ; 0 1 + vbroadcasti128 ymm0, [srcq+ssq*0] + vpblendd ymm2, ymm2, ymm0, 0xf0 ; 1 2 + punpcklbw ymm1, ymm2, ymm3 + punpckhbw ymm2, ymm3 + pmaddubsw ymm1, ym4 + pmaddubsw ymm2, ym4 + pmulhrsw ymm1, ym5 + pmulhrsw ymm2, ym5 + packuswb ymm1, ymm2 + mova [dstq+dsq*0], xmm1 + vextracti128 [dstq+dsq*1], ymm1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w16_loop + vzeroupper + RET +.v_w32: + movu ym0, [srcq+ssq*0] + kxnorb k1, k1, k1 +.v_w32_loop: + vbroadcasti32x8 m2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendmd m3{k1}, m2, m0 ; 0 1 + vbroadcasti32x8 m0, [srcq+ssq*0] + vpblendmd m2{k1}, m0, m2 ; 1 2 + punpcklbw m1, m2, m3 + punpckhbw m2, m3 + pmaddubsw m1, m4 + pmaddubsw m2, m4 + pmulhrsw m1, m5 + pmulhrsw m2, m5 + packuswb m1, m2 + mova [dstq+dsq*0], ym1 + vextracti32x8 [dstq+dsq*1], m1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w32_loop + RET +.v_w64: + movu m0, [srcq+ssq*0] +.v_w64_loop: + movu m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklbw m1, m3, m0 + punpckhbw m6, m3, m0 + 
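+; m4 packs ((16-my) << 8) | my into each word, so interleaving row n+1 with
+; row n lets one pmaddubsw produce my*src[n+1] + (16-my)*src[n] per pixel;
+; pmulhrsw with m5 = pw_2048 then performs the final (x + 8) >> 4 rounding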
movu m0, [srcq+ssq*0] + pmaddubsw m1, m4 + pmaddubsw m6, m4 + punpcklbw m2, m0, m3 + punpckhbw m7, m0, m3 + pmaddubsw m2, m4 + pmaddubsw m7, m4 + REPX {pmulhrsw x, m5}, m1, m6, m2, m7 + packuswb m1, m6 + packuswb m2, m7 + mova [dstq+dsq*0], m1 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w64_loop + RET +.v_w128: + movu m0, [srcq+64*0] + movu m1, [srcq+64*1] +.v_w128_loop: + add srcq, ssq + movu m2, [srcq+64*0] + movu m3, [srcq+64*1] + punpcklbw m6, m2, m0 + pmaddubsw m6, m4 + punpckhbw m0, m2, m0 + pmaddubsw m0, m4 + punpcklbw m7, m3, m1 + pmaddubsw m7, m4 + punpckhbw m1, m3, m1 + pmaddubsw m1, m4 + REPX {pmulhrsw x, m5}, m6, m0, m7, m1 + packuswb m6, m0 + mova m0, m2 + packuswb m7, m1 + mova m1, m3 + mova [dstq+64*0], m6 + mova [dstq+64*1], m7 + add dstq, dsq + dec hd + jg .v_w128_loop + RET +.hv: + ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8 + ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4 + movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)] + WIN64_SPILL_XMM 8 + shl mxyd, 11 ; can't shift by 12 due to signed overflow + vpbroadcastd m7, [pw_2048] + add wq, r7 + vpbroadcastw m6, mxyd + jmp wq +.hv_w2: + vpbroadcastd xmm0, [srcq+ssq*0] + pshufb xmm0, xm4 + pmaddubsw xmm0, xm5 +.hv_w2_loop: + movd xmm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pinsrd xmm1, [srcq+ssq*0], 1 + pshufb xmm1, xm4 + pmaddubsw xmm1, xm5 ; 1 _ 2 _ + shufps xmm2, xmm0, xmm1, q1032 ; 0 _ 1 _ + mova xmm0, xmm1 + psubw xmm1, xmm2 + paddw xmm1, xmm1 + pmulhw xmm1, xm6 + paddw xmm1, xmm2 + pmulhrsw xmm1, xm7 + packuswb xmm1, xmm1 + pextrw [dstq+dsq*0], xmm1, 0 + pextrw [dstq+dsq*1], xmm1, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + mova xmm4, [bilin_h_shuf4] + movddup xmm0, [srcq+ssq*0] + pshufb xmm0, xmm4 + pmaddubsw xmm0, xm5 +.hv_w4_loop: + movq xmm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps xmm1, [srcq+ssq*0] + pshufb xmm1, xmm4 + pmaddubsw xmm1, xm5 ; 1 2 + shufps xmm2, xmm0, xmm1, q1032 ; 0 1 + mova xmm0, xmm1 + psubw xmm1, xmm2 + paddw xmm1, xmm1 + pmulhw xmm1, xm6 + paddw xmm1, xmm2 + pmulhrsw xmm1, xm7 + packuswb xmm1, xmm1 + movd [dstq+dsq*0], xmm1 + pextrd [dstq+dsq*1], xmm1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + vbroadcasti128 ym0, [srcq+ssq*0] + pshufb ym0, ym4 + pmaddubsw ym0, ym5 +.hv_w8_loop: + movu xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti128 ym1, [srcq+ssq*0], 1 + pshufb ym1, ym4 + pmaddubsw ym1, ym5 ; 1 2 + valignq ym2, ym1, ym0, 2 + mova ym0, ym1 + psubw ym1, ym2 + paddw ym1, ym1 + pmulhw ym1, ym6 + paddw ym1, ym2 + pmulhrsw ym1, ym7 + vpmovuswb xm1, ym1 + movq [dstq+dsq*0], xm1 + movhps [dstq+dsq*1], xm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w16: + vbroadcasti32x8 m0, [srcq+ssq*0] + mova m4, [bilin_h_perm16] + vpermb m0, m4, m0 + pmaddubsw m0, m5 +.hv_w16_loop: + movu ym1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti32x8 m1, [srcq+ssq*0], 1 + vpermb m1, m4, m1 + pmaddubsw m1, m5 ; 1 2 + valignq m2, m1, m0, 4 ; 0 1 + mova m0, m1 + psubw m1, m2 + paddw m1, m1 + pmulhw m1, m6 + paddw m1, m2 + pmulhrsw m1, m7 + vpmovuswb ym1, m1 + mova [dstq+dsq*0], xm1 + vextracti32x4 [dstq+dsq*1], ym1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w16_loop + RET +.hv_w32: + mova m4, [bilin_h_perm32] + vpermb m0, m4, [srcq+ssq*0] + pmovzxbq m8, [pb_02461357] + pmaddubsw m0, m5 +.hv_w32_loop: + vpermb m2, m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpermb m3, m4, [srcq+ssq*0] + pmaddubsw m2, m5 + psubw m1, m2, m0 + paddw m1, m1 + pmulhw 
m1, m6 + paddw m1, m0 + pmaddubsw m0, m3, m5 + psubw m3, m0, m2 + paddw m3, m3 + pmulhw m3, m6 + paddw m3, m2 + pmulhrsw m1, m7 + pmulhrsw m3, m7 + packuswb m1, m3 + vpermq m1, m8, m1 + mova [dstq+dsq*0], ym1 + vextracti32x8 [dstq+dsq*1], m1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w32_loop + RET +.hv_w64: + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 +.hv_w64_loop: + add srcq, ssq + movu m2, [srcq+8*0] + movu m3, [srcq+8*1] + pshufb m2, m4 + pshufb m3, m4 + pmaddubsw m2, m5 + pmaddubsw m3, m5 + psubw m8, m2, m0 + psubw m9, m3, m1 + paddw m8, m8 + pmulhw m8, m6 + paddw m9, m9 + pmulhw m9, m6 + paddw m8, m0 + pmulhrsw m8, m7 + paddw m9, m1 + pmulhrsw m9, m7 + mova m0, m2 + mova m1, m3 + packuswb m8, m9 + mova [dstq], m8 + add dstq, dsq + dec hd + jg .hv_w64_loop + RET +.hv_w128: + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] + movu m2, [srcq+8*8] + movu m3, [srcq+8*9] + REPX {pshufb x, m4}, m0, m1, m2, m3 + REPX {pmaddubsw x, m5}, m0, m1, m2, m3 +.hv_w128_loop: + add srcq, ssq + movu m8, [srcq+8*0] + movu m9, [srcq+8*1] + movu m10, [srcq+8*8] + movu m11, [srcq+8*9] + REPX {pshufb x, m4}, m8, m9, m10, m11 + REPX {pmaddubsw x, m5}, m8, m9, m10, m11 + psubw m12, m8, m0 + psubw m13, m9, m1 + psubw m14, m10, m2 + psubw m15, m11, m3 + paddw m12, m12 + pmulhw m12, m6 + paddw m13, m13 + pmulhw m13, m6 + paddw m14, m14 + pmulhw m14, m6 + paddw m15, m15 + pmulhw m15, m6 + paddw m12, m0 + pmulhrsw m12, m7 + paddw m13, m1 + pmulhrsw m13, m7 + paddw m14, m2 + pmulhrsw m14, m7 + paddw m15, m3 + pmulhrsw m15, m7 + mova m0, m8 + mova m1, m9 + mova m2, m10 + mova m3, m11 + packuswb m12, m13 + packuswb m14, m15 + mova [dstq+64*0], m12 + mova [dstq+64*1], m14 + add dstq, dsq + dec hd + jg .hv_w128_loop + RET + +DECLARE_REG_TMP 3, 5, 6 + +cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 + movifnidn mxyd, r5m ; mx + lea t2, [prep_avx512icl] + tzcnt wd, wm + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r6m ; my + test mxyd, mxyd + jnz .v +.prep: + movzx wd, word [t2+wq*2+table_offset(prep,)] + add wq, t2 + lea stride3q, [strideq*3] + jmp wq +.prep_w4: + movd xmm0, [srcq+strideq*0] + pinsrd xmm0, [srcq+strideq*1], 1 + pinsrd xmm0, [srcq+strideq*2], 2 + pinsrd xmm0, [srcq+stride3q ], 3 + lea srcq, [srcq+strideq*4] + pmovzxbw ym0, xmm0 + psllw ym0, 4 + mova [tmpq], ym0 + add tmpq, 32 + sub hd, 4 + jg .prep_w4 + RET +.prep_w8: + movq xmm0, [srcq+strideq*0] + movq xmm1, [srcq+strideq*1] + vinserti128 ym0, ymm0, [srcq+strideq*2], 1 + vinserti128 ym1, ymm1, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + punpcklqdq ym0, ym1 + pmovzxbw m0, ym0 + psllw m0, 4 + mova [tmpq], m0 + add tmpq, 32*2 + sub hd, 4 + jg .prep_w8 + RET +.prep_w16: + movu xmm0, [srcq+strideq*0] + vinserti128 ym0, ymm0, [srcq+strideq*1], 1 + movu xmm1, [srcq+strideq*2] + vinserti128 ym1, ymm1, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + pmovzxbw m0, ym0 + pmovzxbw m1, ym1 + psllw m0, 4 + psllw m1, 4 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + add tmpq, 32*4 + sub hd, 4 + jg .prep_w16 + RET +.prep_w32: + pmovzxbw m0, [srcq+strideq*0] + pmovzxbw m1, [srcq+strideq*1] + pmovzxbw m2, [srcq+strideq*2] + pmovzxbw m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + REPX {psllw x, 4}, m0, m1, m2, m3 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + mova [tmpq+64*2], m2 + mova [tmpq+64*3], m3 + add tmpq, 64*4 + sub hd, 4 + jg .prep_w32 + RET +.prep_w64: + pmovzxbw m0, [srcq+strideq*0+32*0] + pmovzxbw m1, [srcq+strideq*0+32*1] + pmovzxbw m2, 
[srcq+strideq*1+32*0] + pmovzxbw m3, [srcq+strideq*1+32*1] + lea srcq, [srcq+strideq*2] + REPX {psllw x, 4}, m0, m1, m2, m3 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + mova [tmpq+64*2], m2 + mova [tmpq+64*3], m3 + add tmpq, 64*4 + sub hd, 2 + jg .prep_w64 + RET +.prep_w128: + pmovzxbw m0, [srcq+32*0] + pmovzxbw m1, [srcq+32*1] + pmovzxbw m2, [srcq+32*2] + pmovzxbw m3, [srcq+32*3] + REPX {psllw x, 4}, m0, m1, m2, m3 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + mova [tmpq+64*2], m2 + mova [tmpq+64*3], m3 + add tmpq, 64*4 + add srcq, strideq + dec hd + jg .prep_w128 + RET +.h: + ; 16 * src[x] + (mx * (src[x + 1] - src[x])) + ; = (16 - mx) * src[x] + mx * src[x + 1] + imul mxyd, 0xff01 + add mxyd, 16 << 8 + vpbroadcastw m5, mxyd + mov mxyd, r6m ; my + test mxyd, mxyd + jnz .hv + movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)] + add wq, t2 + lea stride3q, [strideq*3] + jmp wq +.h_w4: + vbroadcasti32x4 ym4, [bilin_h_shuf4] +.h_w4_loop: + movq xmm0, [srcq+strideq*0] + movq xmm1, [srcq+strideq*1] + vinserti32x4 ym0, ymm0, [srcq+strideq*2], 1 + vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + punpcklqdq ym0, ym1 + pshufb ym0, ym4 + pmaddubsw ym0, ym5 + mova [tmpq], ym0 + add tmpq, 32 + sub hd, 4 + jg .h_w4_loop + RET +.h_w8: + vbroadcasti32x4 m4, [bilin_h_shuf8] +.h_w8_loop: + movu xmm0, [srcq+strideq*0] + vinserti32x4 ym0, ymm0, [srcq+strideq*1], 1 + vinserti32x4 m0, [srcq+strideq*2], 2 + vinserti32x4 m0, [srcq+stride3q ], 3 + lea srcq, [srcq+strideq*4] + pshufb m0, m4 + pmaddubsw m0, m5 + mova [tmpq], m0 + add tmpq, 64 + sub hd, 4 + jg .h_w8_loop + RET +.h_w16: + mova m4, [bilin_h_perm16] +.h_w16_loop: + movu ym0, [srcq+strideq*0] + vinserti32x8 m0, [srcq+strideq*1], 1 + movu ym1, [srcq+strideq*2] + vinserti32x8 m1, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + vpermb m0, m4, m0 + vpermb m1, m4, m1 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + add tmpq, 64*2 + sub hd, 4 + jg .h_w16_loop + RET +.h_w32: + mova m4, [bilin_h_perm32] +.h_w32_loop: + vpermb m0, m4, [srcq+strideq*0] + vpermb m1, m4, [srcq+strideq*1] + vpermb m2, m4, [srcq+strideq*2] + vpermb m3, m4, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmaddubsw m2, m5 + pmaddubsw m3, m5 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + mova [tmpq+64*2], m2 + mova [tmpq+64*3], m3 + add tmpq, 64*4 + sub hd, 4 + jg .h_w32_loop + RET +.h_w64: + mova m4, [bilin_h_perm32] +.h_w64_loop: + vpermb m0, m4, [srcq+strideq*0+32*0] + vpermb m1, m4, [srcq+strideq*0+32*1] + vpermb m2, m4, [srcq+strideq*1+32*0] + vpermb m3, m4, [srcq+strideq*1+32*1] + lea srcq, [srcq+strideq*2] + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmaddubsw m2, m5 + pmaddubsw m3, m5 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + mova [tmpq+64*2], m2 + mova [tmpq+64*3], m3 + add tmpq, 64*4 + sub hd, 2 + jg .h_w64_loop + RET +.h_w128: + mova m4, [bilin_h_perm32] +.h_w128_loop: + vpermb m0, m4, [srcq+32*0] + vpermb m1, m4, [srcq+32*1] + vpermb m2, m4, [srcq+32*2] + vpermb m3, m4, [srcq+32*3] + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmaddubsw m2, m5 + pmaddubsw m3, m5 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 + mova [tmpq+64*2], m2 + mova [tmpq+64*3], m3 + add tmpq, 64*4 + add srcq, strideq + dec hd + jg .h_w128_loop + RET +.v: + WIN64_SPILL_XMM 7 + movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)] + imul mxyd, 0xff01 + add mxyd, 16 << 8 + add wq, t2 + lea stride3q, [strideq*3] + vpbroadcastw m6, mxyd + jmp wq +.v_w4: + vpbroadcastd xm0, [srcq+strideq*0] + mov r3d, 
0x29 + vbroadcasti32x4 ym3, [bilin_v_shuf4] + kmovb k1, r3d +.v_w4_loop: + vpblendmd xm1{k1}, xm0, [srcq+strideq*1] {1to4} ; __01 ____ + vpbroadcastd ym2, [srcq+strideq*2] + vpbroadcastd ym2{k1}, [srcq+stride3q ] ; __2_ 23__ + lea srcq, [srcq+strideq*4] + vpbroadcastd ym0, [srcq+strideq*0] + punpckhqdq ym2{k1}, ym1, ym0 ; 012_ 234_ + pshufb ym2, ym3 + pmaddubsw ym2, ym6 + mova [tmpq], ym2 + add tmpq, 32 + sub hd, 4 + jg .v_w4_loop + RET +.v_w8: + mova m5, [bilin_v_perm8] + vbroadcasti32x4 ym0, [srcq+strideq*0] +.v_w8_loop: + vinserti32x4 ym1, ym0, [srcq+strideq*1], 1 + vpbroadcastq ym0, [srcq+strideq*2] + vinserti32x4 m1, [srcq+stride3q ], 2 + lea srcq, [srcq+strideq*4] + vinserti32x4 ym0, [srcq+strideq*0], 0 + vpermt2b m1, m5, m0 + pmaddubsw m1, m6 + mova [tmpq], m1 + add tmpq, 64 + sub hd, 4 + jg .v_w8_loop + RET +.v_w16: + mova m5, [bilin_v_perm16] + movu xm0, [srcq+strideq*0] +.v_w16_loop: + movu xm2, [srcq+strideq*2] + vinserti32x4 ym1, ym0, [srcq+strideq*1], 1 + vpermt2b m1, m5, m2 + vinserti32x4 ym2, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + movu xm0, [srcq+strideq*0] + vpermt2b m2, m5, m0 + pmaddubsw m1, m6 + pmaddubsw m2, m6 + mova [tmpq+64*0], m1 + mova [tmpq+64*1], m2 + add tmpq, 64*2 + sub hd, 4 + jg .v_w16_loop + RET +.v_w32: + mova m5, [bilin_v_perm32] + movu ym0, [srcq+strideq*0] +.v_w32_loop: + movu ym2, [srcq+strideq*1] + movu ym3, [srcq+strideq*2] + movu ym4, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpermt2b m0, m5, m2 + vpermt2b m2, m5, m3 + vpermt2b m3, m5, m4 + pmaddubsw m1, m0, m6 + movu ym0, [srcq+strideq*0] + vpermt2b m4, m5, m0 + pmaddubsw m2, m6 + pmaddubsw m3, m6 + pmaddubsw m4, m6 + mova [tmpq+64*0], m1 + mova [tmpq+64*1], m2 + mova [tmpq+64*2], m3 + mova [tmpq+64*3], m4 + add tmpq, 64*4 + sub hd, 4 + jg .v_w32_loop + RET +.v_w64: + mova m5, [bilin_v_perm64] + vpermq m0, m5, [srcq+strideq*0] +.v_w64_loop: + vpermq m1, m5, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + punpcklbw m4, m1, m0 + punpckhbw m2, m1, m0 + vpermq m0, m5, [srcq+strideq*0] + punpcklbw m3, m0, m1 + punpckhbw m1, m0, m1 + pmaddubsw m4, m6 + pmaddubsw m2, m6 + pmaddubsw m3, m6 + pmaddubsw m1, m6 + mova [tmpq+64*0], m4 + mova [tmpq+64*1], m2 + mova [tmpq+64*2], m3 + mova [tmpq+64*3], m1 + add tmpq, 64*4 + sub hd, 2 + jg .v_w64_loop + RET +.v_w128: + mova m5, [bilin_v_perm64] + vpermq m0, m5, [srcq+strideq*0+ 0] + vpermq m1, m5, [srcq+strideq*0+64] +.v_w128_loop: + vpermq m2, m5, [srcq+strideq*1+ 0] + vpermq m3, m5, [srcq+strideq*1+64] + lea srcq, [srcq+strideq*2] + punpcklbw m4, m2, m0 + punpckhbw m0, m2, m0 + pmaddubsw m4, m6 + pmaddubsw m0, m6 + mova [tmpq+64*0], m4 + mova [tmpq+64*1], m0 + punpcklbw m4, m3, m1 + punpckhbw m1, m3, m1 + pmaddubsw m4, m6 + pmaddubsw m1, m6 + mova [tmpq+64*2], m4 + mova [tmpq+64*3], m1 + vpermq m0, m5, [srcq+strideq*0+ 0] + vpermq m1, m5, [srcq+strideq*0+64] + punpcklbw m4, m0, m2 + punpckhbw m2, m0, m2 + pmaddubsw m4, m6 + pmaddubsw m2, m6 + mova [tmpq+64*4], m4 + mova [tmpq+64*5], m2 + punpcklbw m4, m1, m3 + punpckhbw m3, m1, m3 + pmaddubsw m4, m6 + pmaddubsw m3, m6 + mova [tmpq+64*6], m4 + mova [tmpq+64*7], m3 + add tmpq, 64*8 + sub hd, 2 + jg .v_w128_loop + RET +.hv: + ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 + ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 7 + movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)] + shl mxyd, 11 + vpbroadcastw m6, mxyd + add wq, t2 + lea stride3q, [strideq*3] + jmp wq +.hv_w4: + vbroadcasti32x4 
ym4, [bilin_h_shuf4] + vpbroadcastq ym0, [srcq+strideq*0] + pshufb ym0, ym4 + pmaddubsw ym0, ym5 +.hv_w4_loop: + movq xmm1, [srcq+strideq*1] + movq xmm2, [srcq+strideq*2] + vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + vinserti32x4 ym2, ymm2, [srcq+strideq*0], 1 + punpcklqdq ym1, ym2 + pshufb ym1, ym4 + pmaddubsw ym1, ym5 ; 1 2 3 4 + valignq ym2, ym1, ym0, 3 ; 0 1 2 3 + mova ym0, ym1 + psubw ym1, ym2 + pmulhrsw ym1, ym6 + paddw ym1, ym2 + mova [tmpq], ym1 + add tmpq, 32 + sub hd, 4 + jg .hv_w4_loop + RET +.hv_w8: + vbroadcasti32x4 m4, [bilin_h_shuf8] + vbroadcasti32x4 m0, [srcq+strideq*0] + pshufb m0, m4 + pmaddubsw m0, m5 +.hv_w8_loop: + movu xmm1, [srcq+strideq*1] + vinserti128 ym1, ymm1, [srcq+strideq*2], 1 + vinserti128 m1, [srcq+stride3q ], 2 + lea srcq, [srcq+strideq*4] + vinserti128 m1, [srcq+strideq*0], 3 + pshufb m1, m4 + pmaddubsw m1, m5 ; 1 2 3 4 + valignq m2, m1, m0, 6 ; 0 1 2 3 + mova m0, m1 + psubw m1, m2 + pmulhrsw m1, m6 + paddw m1, m2 + mova [tmpq], m1 + add tmpq, 64 + sub hd, 4 + jg .hv_w8_loop + RET +.hv_w16: + mova m4, [bilin_h_perm16] + vbroadcasti32x8 m0, [srcq+strideq*0] + vpermb m0, m4, m0 + pmaddubsw m0, m5 +.hv_w16_loop: + movu ym1, [srcq+strideq*1] + vinserti32x8 m1, [srcq+strideq*2], 1 + movu ym2, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vinserti32x8 m2, [srcq+strideq*0], 1 + vpermb m1, m4, m1 + vpermb m2, m4, m2 + pmaddubsw m1, m5 ; 1 2 + vshufi32x4 m3, m0, m1, q1032 ; 0 1 + pmaddubsw m0, m2, m5 ; 3 4 + vshufi32x4 m2, m1, m0, q1032 ; 2 3 + psubw m1, m3 + pmulhrsw m1, m6 + paddw m1, m3 + psubw m3, m0, m2 + pmulhrsw m3, m6 + paddw m3, m2 + mova [tmpq+64*0], m1 + mova [tmpq+64*1], m3 + add tmpq, 64*2 + sub hd, 4 + jg .hv_w16_loop + RET +.hv_w32: + mova m4, [bilin_h_perm32] + vpermb m0, m4, [srcq+strideq*0] + pmaddubsw m0, m5 +.hv_w32_loop: + vpermb m1, m4, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vpermb m2, m4, [srcq+strideq*0] + pmaddubsw m1, m5 + psubw m3, m1, m0 + pmulhrsw m3, m6 + paddw m3, m0 + pmaddubsw m0, m2, m5 + psubw m2, m0, m1 + pmulhrsw m2, m6 + paddw m2, m1 + mova [tmpq+64*0], m3 + mova [tmpq+64*1], m2 + add tmpq, 64*2 + sub hd, 2 + jg .hv_w32_loop + RET +.hv_w64: + mova m4, [bilin_h_perm32] + vpermb m0, m4, [srcq+32*0] + vpermb m1, m4, [srcq+32*1] + pmaddubsw m0, m5 + pmaddubsw m1, m5 +.hv_w64_loop: + add srcq, strideq + vpermb m2, m4, [srcq+32*0] + vpermb m3, m4, [srcq+32*1] + pmaddubsw m2, m5 + pmaddubsw m3, m5 + psubw m7, m2, m0 + psubw m8, m3, m1 + pmulhrsw m7, m6 + pmulhrsw m8, m6 + paddw m7, m0 + mova m0, m2 + paddw m8, m1 + mova m1, m3 + mova [tmpq+64*0], m7 + mova [tmpq+64*1], m8 + add tmpq, 64*2 + dec hd + jg .hv_w64_loop + RET +.hv_w128: + mova m4, [bilin_h_perm32] + vpermb m0, m4, [srcq+32*0] + vpermb m1, m4, [srcq+32*1] + vpermb m2, m4, [srcq+32*2] + vpermb m3, m4, [srcq+32*3] + REPX {pmaddubsw x, m5}, m0, m1, m2, m3 +.hv_w128_loop: + add srcq, strideq + vpermb m7, m4, [srcq+32*0] + vpermb m8, m4, [srcq+32*1] + vpermb m9, m4, [srcq+32*2] + vpermb m10, m4, [srcq+32*3] + REPX {pmaddubsw x, m5}, m7, m8, m9, m10 + psubw m11, m7, m0 + psubw m12, m8, m1 + psubw m13, m9, m2 + psubw m14, m10, m3 + REPX {pmulhrsw x, m6}, m11, m12, m13, m14 + paddw m11, m0 + mova m0, m7 + paddw m12, m1 + mova m1, m8 + paddw m13, m2 + mova m2, m9 + paddw m14, m3 + mova m3, m10 + mova [tmpq+64*0], m11 + mova [tmpq+64*1], m12 + mova [tmpq+64*2], m13 + mova [tmpq+64*3], m14 + add tmpq, 64*4 + dec hd + jg .hv_w128_loop + RET + +; int8_t subpel_filters[5][15][8] +%assign FILTER_REGULAR (0*15 << 16) | 3*15 +%assign FILTER_SMOOTH 
(1*15 << 16) | 4*15 +%assign FILTER_SHARP (2*15 << 16) | 3*15 + +%macro FN 4 ; fn, type, type_h, type_v +cglobal %1_%2_8bpc + mov t0d, FILTER_%3 +%ifidn %3, %4 + mov t1d, t0d +%else + mov t1d, FILTER_%4 +%endif +%ifnidn %2, regular ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX) +%endif +%endmacro + +%macro PUT_8TAP_H 4-5 0 ; dst/src, tmp[1-3], vpermb +%if %5 + vpermb m%2, m6, m%1 + vpermb m%3, m7, m%1 + vpermb m%4, m8, m%1 +%else +%if %2 < %4 ; reuse a previous value if possible + pshufb m%2, m%1, m6 +%endif + pshufb m%3, m%1, m7 + pshufb m%4, m%1, m8 +%endif + mova m%1, m5 + vpdpbusd m%1, m%2, m9 + mova m%2, m5 + vpdpbusd m%2, m%3, m9 + vpdpbusd m%1, m%3, m10 + vpdpbusd m%2, m%4, m10 + packusdw m%1, m%2 + psrlw m%1, 6 +%endmacro + +%if WIN64 +DECLARE_REG_TMP 4, 5 +%else +DECLARE_REG_TMP 7, 8 +%endif + +%define PUT_8TAP_FN FN put_8tap, + +PUT_8TAP_FN sharp, SHARP, SHARP +PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH +PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP +PUT_8TAP_FN smooth, SMOOTH, SMOOTH +PUT_8TAP_FN sharp_regular, SHARP, REGULAR +PUT_8TAP_FN regular_sharp, REGULAR, SHARP +PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR +PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_FN regular, REGULAR, REGULAR + +cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 +%define base r8-put_avx512icl + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r8, [put_avx512icl] + movsxd wq, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v + tzcnt wd, wd + movzx wd, word [r8+wq*2+table_offset(put,)] + add wq, r8 + lea r6, [ssq*3] + lea r7, [dsq*3] +%if WIN64 + pop r8 +%endif + jmp wq +.h: + test myd, 0xf00 + jnz .hv + vpbroadcastd m5, [pd_34] ; 2 + (8 << 2) + WIN64_SPILL_XMM 11 + cmp wd, 4 + jl .h_w2 + vbroadcasti128 m6, [subpel_h_shufA] + je .h_w4 + tzcnt wd, wd + vbroadcasti128 m7, [subpel_h_shufB] + vbroadcasti128 m8, [subpel_h_shufC] + shr mxd, 16 + sub srcq, 3 + movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)] + vpbroadcastd m9, [base+mxq*8+subpel_filters+0] + vpbroadcastd m10, [base+mxq*8+subpel_filters+4] + add wq, r8 + jmp wq +.h_w2: + movzx mxd, mxb + dec srcq + mova xmm4, [subpel_h_shuf4] + vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2] +.h_w2_loop: + movq xmm0, [srcq+ssq*0] + movhps xmm0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xmm0, xmm4 + mova xmm1, xm5 + vpdpbusd xmm1, xmm0, xmm3 + packssdw xmm0, xmm1, xmm1 + psraw xmm0, 6 + packuswb xmm0, xm0 + pextrw [dstq+dsq*0], xmm0, 0 + pextrw [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2_loop + RET +.h_w4: + movzx mxd, mxb + dec srcq + vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2] +.h_w4_loop: + movq xmm0, [srcq+ssq*0] + movq xmm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb xmm0, xm6 + pshufb xmm1, xm6 + mova xmm2, xm5 + vpdpbusd xmm2, xmm0, xmm3 + mova xmm0, xm5 + vpdpbusd xmm0, xmm1, xmm3 + packssdw xmm0, xmm2, xmm0 + psraw xmm0, 6 + packuswb xmm0, xmm0 + movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4_loop + RET +.h_w8: + movu xm0, [srcq+ssq*0] + vinserti32x4 ym0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + WRAP_YMM PUT_8TAP_H 0, 1, 2, 3 + vpmovuswb xm0, ym0 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + mova m6, [spel_h_perm16a] + mova m7, [spel_h_perm16b] + mova m8, [spel_h_perm16c] +.h_w16_loop: + movu ym0, [srcq+ssq*0] + vinserti32x8 
m0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + PUT_8TAP_H 0, 1, 2, 3, 1 + vpmovuswb ym0, m0 + mova [dstq+dsq*0], xm0 + vextracti128 [dstq+dsq*1], ym0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w16_loop + RET +.h_w32: + movu ym0, [srcq+ssq*0+8*0] + vinserti32x8 m0, [srcq+ssq*1+8*0], 1 + movu ym1, [srcq+ssq*0+8*1] + vinserti32x8 m1, [srcq+ssq*1+8*1], 1 + lea srcq, [srcq+ssq*2] + PUT_8TAP_H 0, 2, 3, 4 + PUT_8TAP_H 1, 4, 3, 2 + packuswb m0, m1 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w32 + RET +.h_w64: + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] + add srcq, ssq + PUT_8TAP_H 0, 2, 3, 4 + PUT_8TAP_H 1, 4, 3, 2 + packuswb m0, m1 + mova [dstq], m0 + add dstq, dsq + dec hd + jg .h_w64 + RET +.h_w128: + movu m0, [srcq+8*0] + movu m2, [srcq+8*1] + movu m1, [srcq+8*8] + movu m3, [srcq+8*9] + add srcq, ssq + PUT_8TAP_H 0, 4, 11, 12 + PUT_8TAP_H 2, 12, 11, 4 + PUT_8TAP_H 1, 4, 11, 12 + PUT_8TAP_H 3, 12, 11, 4 + packuswb m0, m2 + packuswb m1, m3 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + add dstq, dsq + dec hd + jg .h_w128 + RET +.v: + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + tzcnt r6d, wd + movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)] + vpbroadcastd m7, [pw_512] + lea myq, [base+subpel_filters+myq*8] + vpbroadcastw m8, [myq+0] + vpbroadcastw m9, [myq+2] + vpbroadcastw m10, [myq+4] + vpbroadcastw m11, [myq+6] + add r6, r8 + lea ss3q, [ssq*3] + sub srcq, ss3q + jmp r6 +.v_w2: + movd xmm2, [srcq+ssq*0] + pinsrw xmm2, [srcq+ssq*1], 2 + pinsrw xmm2, [srcq+ssq*2], 4 + add srcq, ss3q + pinsrw xmm2, [srcq+ssq*0], 6 ; 0 1 2 3 + movd xmm3, [srcq+ssq*1] + vpbroadcastd xmm1, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastd xmm0, [srcq+ssq*0] + vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5 + vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6 + palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 + punpcklbw xmm3, xmm1 ; 45 56 + punpcklbw xmm1, xmm2, xmm4 ; 01 12 + punpckhbw xmm2, xmm4 ; 23 34 +.v_w2_loop: + pmaddubsw xmm5, xmm1, xm8 ; a0 b0 + mova xmm1, xmm2 + pmaddubsw xmm2, xm9 ; a1 b1 + paddw xmm5, xmm2 + mova xmm2, xmm3 + pmaddubsw xmm3, xm10 ; a2 b2 + paddw xmm5, xmm3 + vpbroadcastd xmm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 + vpbroadcastd xmm0, [srcq+ssq*0] + vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8 + punpcklbw xmm3, xmm4 ; 67 78 + pmaddubsw xmm4, xmm3, xm11 ; a3 b3 + paddw xmm5, xmm4 + pmulhrsw xmm5, xm7 + packuswb xmm5, xmm5 + pextrw [dstq+dsq*0], xmm5, 0 + pextrw [dstq+dsq*1], xmm5, 2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movd xmm2, [srcq+ssq*0] + pinsrd xmm2, [srcq+ssq*1], 1 + pinsrd xmm2, [srcq+ssq*2], 2 + add srcq, ss3q + pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3 + movd xmm3, [srcq+ssq*1] + vpbroadcastd xmm1, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastd xmm0, [srcq+ssq*0] + vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5 + vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6 + palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 + punpcklbw xmm3, xmm1 ; 45 56 + punpcklbw xmm1, xmm2, xmm4 ; 01 12 + punpckhbw xmm2, xmm4 ; 23 34 +.v_w4_loop: + vpbroadcastd xmm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw xmm5, xmm1, xm8 ; a0 b0 + mova xmm1, xmm2 + pmaddubsw xmm2, xm9 ; a1 b1 + paddw xmm5, xmm2 + mova xmm2, xmm3 + pmaddubsw xmm3, xm10 ; a2 b2 + paddw xmm5, xmm3 + vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 + vpbroadcastd xmm0, [srcq+ssq*0] + vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8 + punpcklbw xmm3, xmm4 ; 67 78 + pmaddubsw xmm4, xmm3, xm11 ; a3 b3 + paddw xmm5, xmm4 + pmulhrsw xmm5, xm7 + packuswb xmm5, xmm5 + movd 
[dstq+dsq*0], xmm5 + pextrd [dstq+dsq*1], xmm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + movq xmm1, [srcq+ssq*0] + vpbroadcastq ymm0, [srcq+ssq*1] + vpbroadcastq ymm2, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastq ymm5, [srcq+ssq*0] + vpbroadcastq ymm3, [srcq+ssq*1] + vpbroadcastq ymm4, [srcq+ssq*2] + add srcq, ss3q + vpblendd ymm1, ymm0, 0x30 + vpblendd ymm0, ymm2, 0x30 + punpcklbw ymm1, ymm0 ; 01 12 + vpbroadcastq ymm0, [srcq+ssq*0] + vpblendd ymm2, ymm5, 0x30 + vpblendd ymm5, ymm3, 0x30 + punpcklbw ymm2, ymm5 ; 23 34 + vpblendd ymm3, ymm4, 0x30 + vpblendd ymm4, ymm0, 0x30 + punpcklbw ymm3, ymm4 ; 45 56 +.v_w8_loop: + vpbroadcastq ymm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw ymm5, ymm1, ym8 ; a0 b0 + mova ymm1, ymm2 + pmaddubsw ymm2, ym9 ; a1 b1 + paddw ymm5, ymm2 + mova ymm2, ymm3 + pmaddubsw ymm3, ym10 ; a2 b2 + paddw ymm5, ymm3 + vpblendd ymm3, ymm0, ymm4, 0x30 + vpbroadcastq ymm0, [srcq+ssq*0] + vpblendd ymm4, ymm4, ymm0, 0x30 + punpcklbw ymm3, ymm4 ; 67 78 + pmaddubsw ymm4, ymm3, ym11 ; a3 b3 + paddw ymm5, ymm4 + pmulhrsw ymm5, ym7 + vextracti128 xmm4, ymm5, 1 + packuswb xmm5, xmm4 + movq [dstq+dsq*0], xmm5 + movhps [dstq+dsq*1], xmm5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + vzeroupper + RET +.v_w16: + mova m12, [spel_v_perm16] + vbroadcasti32x4 m1, [srcq+ssq*0] + vbroadcasti32x4 ym4, [srcq+ssq*1] + mov r6d, 0x0f + vbroadcasti32x4 m2, [srcq+ssq*2] + add srcq, ss3q + vbroadcasti32x4 ym5, [srcq+ssq*0] + kmovb k1, r6d + vbroadcasti32x4 m3, [srcq+ssq*1] + vbroadcasti32x4 ym6, [srcq+ssq*2] + add srcq, ss3q + vbroadcasti32x4 m0, [srcq+ssq*0] + vshufpd m1{k1}, m4, m2, 0xcc + vshufpd m2{k1}, m5, m3, 0xcc + vshufpd m3{k1}, m6, m0, 0xcc + vpermb m1, m12, m1 ; 01 12 + vpermb m2, m12, m2 ; 23 34 + vpermb m3, m12, m3 ; 45 56 +.v_w16_loop: + pmaddubsw m4, m1, m8 ; a0 b0 + mova m1, m2 + pmaddubsw m5, m2, m9 ; a1 b1 + mova m2, m3 + pmaddubsw m6, m3, m10 ; a2 b2 + mova m3, m0 + paddw m4, m5 + vbroadcasti32x4 ym5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vbroadcasti32x4 m0, [srcq+ssq*0] + vshufpd m3{k1}, m5, m0, 0xcc + vpermb m3, m12, m3 ; 67 78 + pmaddubsw m5, m3, m11 ; a3 b3 + paddw m4, m6 + paddw m4, m5 + pmulhrsw m4, m7 + vextracti32x8 ym5, m4, 1 + packuswb ym4, ym5 + mova [dstq+dsq*0], xm4 + vextracti32x4 [dstq+dsq*1], ym4, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w16_loop + RET +.v_w32: + mova m12, [spel_v_perm32] + pmovzxbq m14, [pb_02461357] + vpshrdw m13, m12, m12, 8 + movu ym0, [srcq+ssq*0] + vinserti32x8 m0, [srcq+ssq*1], 1 + vpermb m1, m12, m0 ; 01 + vinserti32x8 m0, [srcq+ssq*2], 0 + add srcq, ss3q + vpermb m2, m13, m0 ; 12 + vinserti32x8 m0, [srcq+ssq*0], 1 + vpermb m3, m12, m0 ; 23 + vinserti32x8 m0, [srcq+ssq*1], 0 + vpermb m4, m13, m0 ; 34 + vinserti32x8 m0, [srcq+ssq*2], 1 + add srcq, ss3q + vpermb m5, m12, m0 ; 45 + vinserti32x8 m0, [srcq+ssq*0], 0 + vpermb m6, m13, m0 ; 56 +.v_w32_loop: + vinserti32x8 m0, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pmaddubsw m15, m1, m8 + mova m1, m3 + pmaddubsw m16, m2, m8 + mova m2, m4 + pmaddubsw m17, m3, m9 + mova m3, m5 + pmaddubsw m18, m4, m9 + mova m4, m6 + pmaddubsw m19, m5, m10 + vpermb m5, m12, m0 ; 67 + vinserti32x8 m0, [srcq+ssq*0], 0 + pmaddubsw m20, m6, m10 + vpermb m6, m13, m0 ; 78 + paddw m15, m17 + pmaddubsw m17, m5, m11 + paddw m16, m18 + pmaddubsw m18, m6, m11 + paddw m15, m19 + paddw m16, m20 + paddw m15, m17 + paddw m16, m18 + pmulhrsw m15, m7 + pmulhrsw m16, m7 + packuswb m15, m16 + vpermq m15, m14, m15 + mova [dstq+dsq*0], ym15 + vextracti32x8 [dstq+dsq*1], m15, 1 + 
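+; (packuswb packs within each 128-bit lane, so the vpermq above, with qword
+; indices 0,2,4,6,1,3,5,7 from pb_02461357, restores row order before storing)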
lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w32_loop + vzeroupper + RET +.v_w64: +.v_w128: + lea r6d, [hq+wq*4-256] + mov r4, srcq + mov r7, dstq +.v_loop0: + movu m2, [srcq+ssq*0] + movu m4, [srcq+ssq*1] + movu m6, [srcq+ssq*2] + add srcq, ss3q + movu m13, [srcq+ssq*0] + movu m15, [srcq+ssq*1] + movu m17, [srcq+ssq*2] + add srcq, ss3q + movu m0, [srcq+ssq*0] + punpcklbw m1, m2, m4 ; 01l + punpckhbw m2, m4 ; 01h + punpcklbw m3, m4, m6 ; 12l + punpckhbw m4, m6 ; 12h + punpcklbw m5, m6, m13 ; 23l + punpckhbw m6, m13 ; 23h + punpcklbw m12, m13, m15 ; 34l + punpckhbw m13, m15 ; 34h + punpcklbw m14, m15, m17 ; 45l + punpckhbw m15, m17 ; 45h + punpcklbw m16, m17, m0 ; 56l + punpckhbw m17, m0 ; 56h +.v_loop: + pmaddubsw m18, m1, m8 ; a0l + mova m1, m5 + pmaddubsw m19, m2, m8 ; a0h + mova m2, m6 + pmaddubsw m20, m3, m8 ; b0l + mova m3, m12 + pmaddubsw m21, m4, m8 ; b0h + mova m4, m13 + pmaddubsw m5, m9 ; a1l + pmaddubsw m6, m9 ; a1h + pmaddubsw m12, m9 ; b1l + pmaddubsw m13, m9 ; b1h + paddw m18, m5 + mova m5, m14 + pmaddubsw m14, m10 ; a2l + paddw m19, m6 + mova m6, m15 + pmaddubsw m15, m10 ; a2h + paddw m20, m12 + mova m12, m16 + pmaddubsw m16, m10 ; b2l + paddw m21, m13 + mova m13, m17 + pmaddubsw m17, m10 ; b2h + paddw m18, m14 + paddw m19, m15 + paddw m20, m16 + paddw m21, m17 + movu m17, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklbw m14, m0, m17 ; 67l + punpckhbw m15, m0, m17 ; 67h + pmaddubsw m16, m14, m11 ; a3l + pmaddubsw m0, m15, m11 ; a3h + paddw m18, m16 + paddw m19, m0 + movu m0, [srcq+ssq*0] + punpcklbw m16, m17, m0 ; 78l + punpckhbw m17, m0 ; 78h + pmulhrsw m18, m7 + pmulhrsw m19, m7 + packuswb m18, m19 + mova [dstq+dsq*0], m18 + pmaddubsw m18, m16, m11 ; b3l + pmaddubsw m19, m17, m11 ; b3h + paddw m18, m20 + paddw m19, m21 + pmulhrsw m18, m7 + pmulhrsw m19, m7 + packuswb m18, m19 + mova [dstq+dsq*1], m18 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_loop + add r4, 64 + add r7, 64 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 256 + jg .v_loop0 + vzeroupper + RET +.hv: + cmp wd, 4 + jg .hv_w8 + movzx mxd, mxb + dec srcq + vpbroadcastd m7, [base+subpel_filters+mxq*8+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastd m8, [pd_2] + vpbroadcastq ym0, [base+subpel_filters+myq*8] + lea ss3q, [ssq*3] + vpbroadcastd ym9, [pd_32768] + mov r6, srcq + punpcklbw ym0, ym8, ym0 + sub r6, ss3q + psraw ym0, 2 ; << 6 + mova xm14, [spel_hv_end] + pshufd ym10, ym0, q0000 + pshufd ym11, ym0, q1111 + pshufd ym12, ym0, q2222 + pshufd ym13, ym0, q3333 + cmp wd, 4 + je .hv_w4 + vbroadcasti128 ym6, [subpel_h_shuf4] + movq xmm2, [r6+ssq*0] + movhps xmm2, [r6+ssq*1] + movq xmm0, [r6+ssq*2] + movhps xmm0, [srcq+ssq*0] + vpbroadcastq ymm3, [srcq+ssq*1] + vpbroadcastq ymm4, [srcq+ssq*2] + add srcq, ss3q + vpbroadcastq ymm1, [srcq+ssq*0] + vpblendd ymm2, ymm3, 0x30 + vpblendd ymm0, ymm1, 0x30 ; 2 3 6 _ + vpblendd ymm2, ymm4, 0xc0 ; 0 1 4 5 + pshufb ymm2, ym6 + pshufb ymm0, ym6 + mova ymm1, ym8 + vpdpbusd ymm1, ymm2, ym7 + mova ymm2, ym8 + vpdpbusd ymm2, ymm0, ym7 + packssdw ymm2, ymm1, ymm2 + psraw ymm2, 2 + vextracti128 xmm3, ymm2, 1 + palignr xmm4, xmm3, xmm2, 4 + punpcklwd xmm1, xmm2, xmm4 ; 01 12 + punpckhwd xmm2, xmm4 ; 23 34 + pshufd xmm0, xmm3, q2121 + punpcklwd xmm3, xmm0 ; 45 56 +.hv_w2_loop: + movq xmm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps xmm4, [srcq+ssq*0] + mova xmm5, xm9 + vpdpwssd xmm5, xmm1, xm10 ; a0 b0 + mova xmm1, xmm2 + vpdpwssd xmm5, xmm2, xm11 ; a1 b1 + pshufb xmm4, xm6 + mova xmm2, xmm3 + vpdpwssd xmm5, xmm3, xm12 ; a2 b2 + mova xmm3, xm8 + 
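+; xm8 = pd_2 seeds the accumulator with the rounding constant, so the u8*s8
+; dot product below followed by psraw 2 yields (filter_sum + 2) >> 2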
vpdpbusd xmm3, xmm4, xm7 + packssdw xmm4, xmm3, xmm3 + psraw xmm4, 2 + palignr xmm3, xmm4, xmm0, 12 + mova xmm0, xmm4 + punpcklwd xmm3, xmm4 ; 67 78 + vpdpwssd xmm5, xmm3, xm13 ; a3 b3 + packuswb xmm5, xmm5 + pshufb xmm5, xm14 + pextrw [dstq+dsq*0], xmm5, 0 + pextrw [dstq+dsq*1], xmm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + vzeroupper + RET +.hv_w4: + movq xmm1, [r6+ssq*0] + vpbroadcastq ym2, [r6+ssq*1] + vinserti32x4 ym1, ymm1, [r6+ssq*2], 1 + vinserti32x4 m2, [srcq+ssq*0], 2 + vinserti32x4 m1, [srcq+ssq*1], 2 + vinserti32x4 m2, [srcq+ssq*2], 3 ; _ 1 3 5 + vbroadcasti32x4 m6, [subpel_h_shufA] + add srcq, ss3q + vinserti32x4 m1, [srcq+ssq*0], 3 ; 0 2 4 6 + pshufb m2, m6 + pshufb m1, m6 + mova m0, m8 + vpdpbusd m0, m2, m7 + mova m4, m8 + vpdpbusd m4, m1, m7 + mova ym1, [spel_hv_perm4a] + mova ym2, [spel_hv_perm4b] + mova ym3, [spel_hv_perm4c] + packssdw m0, m4 + psraw m0, 2 ; _ 0 1 2 3 4 5 6 + mov r6d, 0x5555 + vpermb ym1, ym1, ym0 ; 01 12 + vpermb m2, m2, m0 ; 23 34 + vpermb m3, m3, m0 ; 45 56 + kmovw k1, r6d + mova ym15, [spel_hv_perm4d] +.hv_w4_loop: + movq xmm4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti32x4 ym4, ymm4, [srcq+ssq*0], 1 + mova ym5, ym9 + vpdpwssd ym5, ym1, ym10 ; a0 b0 + mova ym1, ym2 + pshufb ym4, ym6 + mova ym0, ym8 + vpdpbusd ym0, ym4, ym7 + vpdpwssd ym5, ym2, ym11 ; a1 b1 + mova ym2, ym3 + vpdpwssd ym5, ym3, ym12 ; a2 b2 + vpsraw ym3{k1}, ym0, 2 ; 7 8 + vpermb ym3, ym15, ym3 ; 67 78 + vpdpwssd ym5, ym3, ym13 ; a3 b3 + packuswb ym5, ym5 + vpermb ym5, ym14, ym5 + movd [dstq+dsq*0], xm5 + pextrd [dstq+dsq*1], xm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + shr mxd, 16 + sub srcq, 3 + vpbroadcastd m10, [base+subpel_filters+mxq*8+0] + vpbroadcastd m11, [base+subpel_filters+mxq*8+4] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + vpbroadcastd m8, [pd_2] + vpbroadcastq m0, [base+subpel_filters+myq*8] + vpbroadcastd m9, [pd_32768] + punpcklbw m0, m8, m0 + lea ss3q, [ssq*3] + psraw m0, 2 ; << 6 + pshufd m12, m0, q0000 + pshufd m13, m0, q1111 + pshufd m14, m0, q2222 + pshufd m15, m0, q3333 + cmp wd, 8 + jne .hv_w16 + mov r6, srcq + sub r6, ss3q + movu xmm1, [r6+ssq*0] + vinserti128 ymm1, [r6+ssq*1], 1 + movu xmm2, [srcq+ssq*1] + vinserti32x4 m6, zmm1, [r6+ssq*2], 2 + vinserti128 ymm2, [srcq+ssq*2], 1 + vinserti32x4 m6, [srcq+ssq*0], 3 ; 0 1 2 3 + add srcq, ss3q + vbroadcasti32x4 m4, [subpel_h_shufA] + vinserti32x4 m0, zmm2, [srcq+ssq*0], 2 ; 4 5 6 _ + vbroadcasti32x4 m7, [subpel_h_shufB] + vbroadcasti32x4 m17, [subpel_h_shufC] + pshufb m1, m6, m4 ; 0 1 2 3 0123 + mova m2, m8 + vpdpbusd m2, m1, m10 + pshufb m5, m6, m7 ; 0 1 2 3 4567 + mova m1, m8 + vpdpbusd m1, m5, m10 + pshufb m4, m0, m4 ; 4 5 6 _ 0123 + mova m3, m8 + vpdpbusd m3, m4, m10 + pshufb m7, m0, m7 ; 4 5 6 _ 4567 + mova m4, m8 + vpdpbusd m4, m7, m10 + pshufb m6, m17 + vpdpbusd m2, m5, m11 + vpdpbusd m1, m6, m11 + pshufb m6, m0, m17 + vpdpbusd m3, m7, m11 + vpdpbusd m4, m6, m11 + mova m5, [spel_hv_perm8a] + mova m0, [spel_hv_perm8b] + mov r6, 0x55555555ff00 + packssdw m2, m1 + packssdw m3, m4 + mova m18, [spel_hv_perm8c] + psraw m2, 2 ; 0 1 2 3 + psraw m3, 2 ; 4 5 6 _ + vpermb m1, m5, m2 ; 01 12 + vbroadcasti32x8 m6, [subpel_h_shufA] + kmovq k1, r6 + vpermt2b m2, m0, m3 ; 23 34 + vbroadcasti32x8 m7, [subpel_h_shufB] + kshiftrq k2, k1, 16 + mova xm16, [spel_hv_end] + vpermb m3, m5, m3 ; 45 56 +.hv_w8_loop: + vbroadcasti32x4 ym4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vbroadcasti32x4 m4{k1}, [srcq+ssq*0] + mova m0, m9 + vpdpwssd m0, m1, m12 ; a0 b0 + 
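+; rows 7-8 are filtered horizontally in between the vertical vpdpwssd steps,
+; presumably to overlap the two dependency chains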
pshufb m1, m4, m6 ; 7 8 0123 4567 + mova m5, m8 + vpdpbusd m5, m1, m10 + pshufb m4, m7 ; 7 8 4567 89ab + vpdpwssd m0, m2, m13 ; a1 b1 + mova m1, m2 + vpdpbusd m5, m4, m11 + mova m2, m3 + vpdpwssd m0, m3, m14 ; a2 b2 + psraw m3{k2}, m5, 2 ; 75 86 + vpermb m3, m18, m3 ; 67 78 + vpdpwssd m0, m3, m15 ; a3 b3 + packuswb m0, m0 + vpermb zmm1, m16, m0 + movq [dstq+dsq*0], xmm1 + movhps [dstq+dsq*1], xmm1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + vzeroupper + RET +.hv_w16: + movu m7, [spel_hv_perm16a] + sub srcq, ss3q + mova m20, [spel_hv_perm16b] + lea r6d, [wq*2-32] + mova m21, [spel_hv_perm16c] + mov r4, srcq + mov r7, dstq + mova ym16, [spel_hv_end16] + lea r6d, [hq+r6*8] +.hv_w16_loop0: + movu ym17, [srcq+ssq*0] + vinserti32x8 m17, [srcq+ssq*1], 1 ; 0 1 + movu ym18, [srcq+ssq*2] + add srcq, ss3q + vinserti32x8 m18, [srcq+ssq*0], 1 ; 2 3 + movu ym19, [srcq+ssq*1] + vinserti32x8 m19, [srcq+ssq*2], 1 ; 4 5 + add srcq, ss3q + vpermb m2, m7, m17 ; 0 1 0123 89ab + vpermb m0, m20, m17 ; 0 1 4567 cdef + vpermb m4, m7, m18 ; 2 3 0123 89ab + mova m1, m8 + vpdpbusd m1, m2, m10 + vpermb m5, m20, m18 ; 2 3 4567 cdef + mova m2, m8 + vpdpbusd m2, m0, m10 + vpermb m17, m21, m17 ; 0 1 89ab ghij + mova m3, m8 + vpdpbusd m3, m4, m10 + vpermb m6, m7, m19 ; 4 5 0123 89ab + mova m4, m8 + vpdpbusd m4, m5, m10 + vpermb m18, m21, m18 ; 2 3 89ab ghij + vpdpbusd m1, m0, m11 + movu ym0, [srcq+ssq*0] ; 6 + vpdpbusd m2, m17, m11 + vpermb m17, m20, m19 ; 4 5 4567 cdef + vpdpbusd m3, m5, m11 + mova m5, m8 + vpdpbusd m5, m6, m10 + mova m6, m8 + vpdpbusd m6, m17, m10 + vpdpbusd m4, m18, m11 + mova m18, [spel_hv_perm16d] + vpermb m18, m18, m0 ; 6 0145 2367 89cd abef + vpdpbusd m5, m17, m11 + vpermb m19, m21, m19 ; 4 5 89ab ghij + mova m17, m8 + vpdpbusd m17, m18, m10 + mova m18, [spel_hv_perm16e] + vpermb m0, m18, m0 ; 6 4589 67ab cdgh efij + packssdw m1, m2 ; 01 + vpdpbusd m6, m19, m11 + packssdw m3, m4 ; 23 + vpdpbusd m17, m0, m11 + psraw m1, 2 + packssdw m5, m6 ; 45 + psraw m3, 2 + vpshrdd m2, m1, m3, 16 ; 12 + psraw m5, 2 + vpshrdd m4, m3, m5, 16 ; 34 + psraw m17, 2 + vpshrdd m6, m5, m17, 16 ; 56 +.hv_w16_loop: + movu ym18, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + vinserti32x8 m18, [srcq+ssq*0], 1 + mova m0, m9 + vpdpwssd m0, m1, m12 ; a0 + vpermb m1, m7, m18 ; 7 8 0123 89ab + mova m17, m9 + vpdpwssd m17, m2, m12 ; b0 + vpermb m2, m20, m18 ; 7 8 4567 cdef + mova m19, m8 + vpdpbusd m19, m1, m10 + vpermb m18, m21, m18 + mova m1, m8 + vpdpbusd m1, m2, m10 + vpdpwssd m0, m3, m13 ; a1 + vpdpwssd m17, m4, m13 ; b1 + vpdpbusd m19, m2, m11 + mova m2, m4 + vpdpbusd m1, m18, m11 + mova m4, m6 + vpdpwssd m0, m5, m14 ; a2 + vpdpwssd m17, m6, m14 ; b2 + packssdw m19, m1 + mova m1, m3 + mova m3, m5 + psraw m6, m19, 2 ; 7 8 + vpshrdd m5, m4, m6, 16 ; 6 7 + vpdpwssd m17, m6, m15 ; b3 + vpdpwssd m0, m5, m15 ; a3 + packuswb m0, m17 + vpermb zmm1, m16, m0 + mova [dstq+dsq*0], xmm1 + vextracti128 [dstq+dsq*1], ymm1, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w16_loop + add r4, 16 + add r7, 16 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<8 + jg .hv_w16_loop0 + vzeroupper + RET + +%macro PREP_8TAP_H 0 + vpermb m10, m5, m0 + vpermb m11, m5, m1 + vpermb m12, m6, m0 + vpermb m13, m6, m1 + vpermb m14, m7, m0 + vpermb m15, m7, m1 + mova m0, m4 + vpdpbusd m0, m10, m8 + mova m2, m4 + vpdpbusd m2, m12, m8 + mova m1, m4 + vpdpbusd m1, m11, m8 + mova m3, m4 + vpdpbusd m3, m13, m8 + vpdpbusd m0, m12, m9 + vpdpbusd m2, m14, m9 + vpdpbusd m1, m13, m9 + vpdpbusd m3, m15, m9 + packssdw m0, m2 + packssdw m1, m3 + psraw m0, 2 + 
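+; prep intermediates are stored as signed 16-bit words: (8-tap sum + 2) >> 2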
psraw m1, 2 + mova [tmpq+64*0], m0 + mova [tmpq+64*1], m1 +%endmacro + +%if WIN64 +DECLARE_REG_TMP 6, 4 +%else +DECLARE_REG_TMP 6, 7 +%endif + +%define PREP_8TAP_FN FN prep_8tap, + +PREP_8TAP_FN sharp, SHARP, SHARP +PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH +PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP +PREP_8TAP_FN smooth, SMOOTH, SMOOTH +PREP_8TAP_FN sharp_regular, SHARP, REGULAR +PREP_8TAP_FN regular_sharp, REGULAR, SHARP +PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR +PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH +PREP_8TAP_FN regular, REGULAR, REGULAR + +cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + lea r7, [prep_avx512icl] + movsxd wq, wm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v + tzcnt wd, wd + movzx wd, word [r7+wq*2+table_offset(prep,)] + add wq, r7 + lea r6, [strideq*3] +%if WIN64 + pop r7 +%endif + jmp wq +.h: + test myd, 0xf00 + jnz .hv + vpbroadcastd m4, [pd_2] + WIN64_SPILL_XMM 10 + cmp wd, 4 + je .h_w4 + tzcnt wd, wd + shr mxd, 16 + sub srcq, 3 + movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)] + vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+0] + vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep_avx512icl+4] + add wq, r7 + jmp wq +.h_w4: + movzx mxd, mxb + vbroadcasti128 ym5, [subpel_h_shufA] + mov r3d, 0x4 + dec srcq + vpbroadcastd ym6, [r7+mxq*8+subpel_filters-prep_avx512icl+2] + kmovb k1, r3d + lea stride3q, [strideq*3] +.h_w4_loop: + movq xm2, [srcq+strideq*0] + movq xm3, [srcq+strideq*1] + vpbroadcastq ym2{k1}, [srcq+strideq*2] + vpbroadcastq ym3{k1}, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + pshufb ym2, ym5 + pshufb ym3, ym5 + mova ym0, ym4 + vpdpbusd ym0, ym2, ym6 + mova ym1, ym4 + vpdpbusd ym1, ym3, ym6 + packssdw ym0, ym1 + psraw ym0, 2 + mova [tmpq], ym0 + add tmpq, 32 + sub hd, 4 + jg .h_w4_loop + RET +.h_w8: + vbroadcasti128 m5, [subpel_h_shufA] + vbroadcasti128 m6, [subpel_h_shufB] + vbroadcasti128 m7, [subpel_h_shufC] + lea stride3q, [strideq*3] +.h_w8_loop: + movu xmm3, [srcq+strideq*0] + vinserti128 ym3, ymm3, [srcq+strideq*1], 1 + vinserti128 m3, [srcq+strideq*2], 2 + vinserti128 m3, [srcq+stride3q ], 3 + lea srcq, [srcq+strideq*4] + pshufb m1, m3, m5 + pshufb m2, m3, m6 + mova m0, m4 + vpdpbusd m0, m1, m8 + mova m1, m4 + vpdpbusd m1, m2, m8 + pshufb m3, m7 + vpdpbusd m0, m2, m9 + vpdpbusd m1, m3, m9 + packssdw m0, m1 + psraw m0, 2 + mova [tmpq], m0 + add tmpq, 64 + sub hd, 4 + jg .h_w8_loop + RET +.h_w16: + mova m5, [spel_h_perm16a] + mova m6, [spel_h_perm16b] + mova m7, [spel_h_perm16c] + lea stride3q, [strideq*3] +.h_w16_loop: + movu ym0, [srcq+strideq*0] + movu ym1, [srcq+strideq*2] + vinserti32x8 m0, [srcq+strideq*1], 1 + vinserti32x8 m1, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + PREP_8TAP_H + add tmpq, 64*2 + sub hd, 4 + jg .h_w16_loop + RET +.h_w32: + mova m5, [spel_h_perm32a] + mova m6, [spel_h_perm32b] + mova m7, [spel_h_perm32c] +.h_w32_loop: + movu m0, [srcq+strideq*0] + movu m1, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + PREP_8TAP_H + add tmpq, 64*2 + sub hd, 2 + jg .h_w32_loop + RET +.h_w64: + xor r6d, r6d + jmp .h_start +.h_w128: + mov r6, -64*1 +.h_start: + mova m5, [spel_h_perm32a] + mova m6, [spel_h_perm32b] + mova m7, [spel_h_perm32c] + sub srcq, r6 + mov r5, r6 +.h_loop: + movu m0, [srcq+r6+32*0] + movu m1, [srcq+r6+32*1] + PREP_8TAP_H + add tmpq, 64*2 + add r6, 64 + jle .h_loop + add srcq, strideq + mov r6, r5 + dec hd + jg .h_loop 
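+; (w64 enters with r6 = 0 and filters one 64-pixel chunk per row; w128 enters
+; with r6 = -64, so the add/jle pair walks two chunks per row)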
+ RET +.v: + movzx mxd, myb ; Select 4-tap/8-tap filter multipliers. + shr myd, 16 ; Note that the code is 8-tap only, having + tzcnt wd, wd + cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4 + cmove myd, mxd ; had a negligible effect on performance. + ; TODO: Would a 6-tap code path be worth it? + lea myq, [r7+myq*8+subpel_filters-prep_avx512icl] + movzx wd, word [r7+wq*2+table_offset(prep, _8tap_v)] + add wq, r7 + lea stride3q, [strideq*3] + sub srcq, stride3q + vpbroadcastd m7, [pw_8192] + vpbroadcastw m8, [myq+0] + vpbroadcastw m9, [myq+2] + vpbroadcastw m10, [myq+4] + vpbroadcastw m11, [myq+6] + jmp wq +.v_w4: + movd xmm0, [srcq+strideq*0] + vpbroadcastd ymm1, [srcq+strideq*2] + vpbroadcastd xmm2, [srcq+strideq*1] + vpbroadcastd ymm3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpblendd ymm1, ymm0, 0x01 ; 0 2 2 _ 2 _ _ _ + vpblendd ymm3, ymm2, 0x03 ; 1 1 3 3 3 3 _ _ + vpbroadcastd ymm0, [srcq+strideq*0] + vpbroadcastd ymm2, [srcq+strideq*1] + vpblendd ymm1, ymm0, 0x68 ; 0 2 2 4 2 4 4 _ + vpbroadcastd ymm0, [srcq+strideq*2] + vbroadcasti128 ymm5, [deint_shuf4] + vpblendd ymm3, ymm2, 0xc0 ; 1 1 3 3 3 3 5 5 + vpblendd ymm2, ymm3, ymm1, 0x55 ; 0 1 2 3 2 3 4 5 + vpblendd ymm3, ymm1, 0xaa ; 1 2 3 4 3 4 5 _ + punpcklbw ymm1, ymm2, ymm3 ; 01 12 23 34 + vpblendd ymm3, ymm0, 0x80 ; 1 2 3 4 3 4 5 6 + punpckhbw ymm2, ymm3 ; 23 34 45 56 +.v_w4_loop: + pinsrd xmm0, [srcq+stride3q ], 1 + lea srcq, [srcq+strideq*4] + vpbroadcastd ymm3, [srcq+strideq*0] + vpbroadcastd ymm4, [srcq+strideq*1] + vpblendd ymm3, ymm4, 0x20 ; _ _ 8 _ 8 9 _ _ + vpblendd ymm3, ymm0, 0x03 ; 6 7 8 _ 8 9 _ _ + vpbroadcastd ymm0, [srcq+strideq*2] + vpblendd ymm3, ymm0, 0x40 ; 6 7 8 _ 8 9 a _ + pshufb ymm3, ymm5 ; 67 78 89 9a + pmaddubsw ymm4, ymm1, ym8 + vperm2i128 ymm1, ymm2, ymm3, 0x21 ; 45 56 67 78 + pmaddubsw ymm2, ym9 + paddw ymm4, ymm2 + mova ymm2, ymm3 + pmaddubsw ymm3, ym11 + paddw ymm3, ymm4 + pmaddubsw ymm4, ymm1, ym10 + paddw ymm3, ymm4 + pmulhrsw ymm3, ym7 + mova [tmpq], ymm3 + add tmpq, 32 + sub hd, 4 + jg .v_w4_loop + vzeroupper + RET +.v_w8: + mov r3d, 0xf044 + kmovw k1, r3d + kshiftrw k2, k1, 8 + movq xm0, [srcq+strideq*0] + vpbroadcastq ym1, [srcq+strideq*1] + vpbroadcastq m2, [srcq+strideq*2] + vpbroadcastq m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpbroadcastq m4, [srcq+strideq*0] + vpbroadcastq m5, [srcq+strideq*1] + vpbroadcastq m6, [srcq+strideq*2] + vmovdqa64 ym0{k1}, ym1 + vmovdqa64 ym1{k1}, ym2 + vmovdqa64 m2{k1}, m3 + vmovdqa64 m3{k1}, m4 + vmovdqa64 m4{k1}, m5 + vmovdqa64 m5{k1}, m6 + punpcklbw ym0, ym1 ; 01 12 __ __ + punpcklbw m2, m3 ; 23 34 23 34 + punpcklbw m4, m5 ; 45 56 45 56 + vmovdqa64 m0{k2}, m2 ; 01 12 23 34 + vmovdqa64 m2{k2}, m4 ; 23 34 45 56 +.v_w8_loop: + vpbroadcastq m1, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpbroadcastq m3, [srcq+strideq*0] + vpbroadcastq m5, [srcq+strideq*1] + pmaddubsw m14, m0, m8 + pmaddubsw m15, m2, m9 + vpblendmq m0{k1}, m6, m1 + vpblendmq m2{k1}, m1, m3 + vpbroadcastq m6, [srcq+strideq*2] + paddw m14, m15 + punpcklbw m2, m0, m2 ; 67 78 67 78 + vpblendmq m12{k1}, m3, m5 + vpblendmq m13{k1}, m5, m6 + vpblendmq m0{k2}, m4, m2 ; 45 56 67 78 + punpcklbw m4, m12, m13 ; 89 9a 89 9a + vmovdqa64 m2{k2}, m4 ; 67 78 89 9a + pmaddubsw m12, m0, m10 + pmaddubsw m13, m2, m11 + paddw m14, m12 + paddw m14, m13 + pmulhrsw m14, m7 + mova [tmpq], m14 + add tmpq, 64 + sub hd, 4 + jg .v_w8_loop + RET +.v_w16: + mov r3d, 0xf0 + kmovb k1, r3d + vbroadcasti128 m0, [srcq+strideq*0] + vbroadcasti128 m1, [srcq+strideq*1] + vbroadcasti128 m2, [srcq+strideq*2] + 
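+; each 16-byte row is broadcast to all four lanes; merge-masking with
+; k1 = 0xf0 (upper 256 bits) then pairs consecutive rows in one register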
vbroadcasti128 m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vbroadcasti128 m4, [srcq+strideq*0] + vbroadcasti128 m5, [srcq+strideq*1] + vbroadcasti128 m6, [srcq+strideq*2] + vmovdqa64 m0{k1}, m1 + vmovdqa64 m1{k1}, m2 + vmovdqa64 m2{k1}, m3 + vmovdqa64 m3{k1}, m4 + vmovdqa64 m4{k1}, m5 + vmovdqa64 m5{k1}, m6 + shufpd m0, m2, 0xcc ; 0a_2a 0b_2b 1a_3a 1b_3b + shufpd m1, m3, 0xcc ; 1a_3a 1b_3b 2a_4a 2b_4b + shufpd m4, m4, 0x44 ; 4a_-- 4b_-- 5a_-- 5b_-- + shufpd m5, m5, 0x44 ; 5a_-- 5b_-- 6a_-- 6b_-- + punpckhbw m2, m0, m1 ; 23a 23b 34a 34b + punpcklbw m0, m1 ; 01a 01b 12a 12b + punpcklbw m4, m5 ; 45a 45b 56a 56b +.v_w16_loop: + vbroadcasti128 m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vbroadcasti128 m5, [srcq+strideq*0] + vpblendmq m1{k1}, m6, m3 + vmovdqa64 m3{k1}, m5 + pmaddubsw m12, m0, m8 + pmaddubsw m13, m2, m8 + pmaddubsw m14, m2, m9 + pmaddubsw m15, m4, m9 + pmaddubsw m0, m4, m10 + vbroadcasti128 m2, [srcq+strideq*1] + vbroadcasti128 m6, [srcq+strideq*2] + paddw m12, m14 + paddw m13, m15 + paddw m12, m0 + vmovdqa64 m5{k1}, m2 + vmovdqa64 m2{k1}, m6 + mova m0, m4 + shufpd m1, m5, 0xcc ; 6a_8a 6b_8b 7a_9a 7b_9b + shufpd m3, m2, 0xcc ; 7a_9a 7b_9b 8a_Aa 8b_Ab + punpcklbw m2, m1, m3 ; 67a 67b 78a 78b + punpckhbw m4, m1, m3 ; 89a 89b 9Aa 9Ab + pmaddubsw m14, m2, m10 + pmaddubsw m15, m2, m11 + paddw m13, m14 + paddw m12, m15 + pmaddubsw m14, m4, m11 + paddw m13, m14 + pmulhrsw m12, m7 + pmulhrsw m13, m7 + mova [tmpq+ 0], m12 + mova [tmpq+64], m13 + add tmpq, 64*2 + sub hd, 4 + jg .v_w16_loop + RET +.v_w32: + mova m18, [bilin_v_perm64] + movu ym0, [srcq+strideq*0] + movu ym1, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + movu ym2, [srcq+strideq*0] + movu ym3, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + movu ym4, [srcq+strideq*0] + movu ym5, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + movu ym6, [srcq+strideq*0] + vpermq m0, m18, m0 + vpermq m1, m18, m1 + vpermq m2, m18, m2 + vpermq m3, m18, m3 + vpermq m4, m18, m4 + vpermq m5, m18, m5 + vpermq m6, m18, m6 + punpcklbw m0, m1 + punpcklbw m1, m2 + punpcklbw m2, m3 + punpcklbw m3, m4 + punpcklbw m4, m5 + punpcklbw m5, m6 +.v_w32_loop: + movu ym12, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + movu ym13, [srcq+strideq*0] + pmaddubsw m14, m0, m8 + pmaddubsw m16, m2, m9 + pmaddubsw m15, m1, m8 + pmaddubsw m17, m3, m9 + mova m0, m2 + mova m1, m3 + vpermq m12, m18, m12 + vpermq m13, m18, m13 + paddw m14, m16 + paddw m15, m17 + pmaddubsw m16, m4, m10 + pmaddubsw m17, m5, m10 + punpcklbw m6, m12 + punpcklbw m12, m13 + mova m2, m4 + mova m3, m5 + paddw m14, m16 + paddw m15, m17 + pmaddubsw m16, m6, m11 + pmaddubsw m17, m12, m11 + mova m4, m6 + mova m5, m12 + paddw m14, m16 + paddw m15, m17 + pmulhrsw m14, m7 + pmulhrsw m15, m7 + mova m6, m13 + mova [tmpq+ 0], m14 + mova [tmpq+64], m15 + add tmpq, 64*2 + sub hd, 2 + jg .v_w32_loop + vzeroupper + RET +.v_w64: + mov wd, 64 + jmp .v_start +.v_w128: + mov wd, 128 +.v_start: + WIN64_SPILL_XMM 27 + mova m26, [bilin_v_perm64] + lea r6d, [hq+wq*2] + mov r5, srcq + mov r7, tmpq +.v_loop0: + vpermq m0, m26, [srcq+strideq*0] + vpermq m1, m26, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vpermq m2, m26, [srcq+strideq*0] + vpermq m3, m26, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vpermq m4, m26, [srcq+strideq*0] + vpermq m5, m26, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vpermq m6, m26, [srcq+strideq*0] + punpckhbw m12, m0, m1 + punpcklbw m0, m1 + punpckhbw m13, m1, m2 + punpcklbw m1, m2 + punpckhbw m14, m2, m3 + punpcklbw m2, m3 + punpckhbw m15, m3, m4 + punpcklbw m3, m4 + 
punpckhbw m16, m4, m5 + punpcklbw m4, m5 + punpckhbw m17, m5, m6 + punpcklbw m5, m6 +.v_loop: + vpermq m18, m26, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vpermq m19, m26, [srcq+strideq*0] + pmaddubsw m20, m0, m8 + pmaddubsw m21, m12, m8 + pmaddubsw m22, m1, m8 + pmaddubsw m23, m13, m8 + mova m0, m2 + mova m12, m14 + mova m1, m3 + mova m13, m15 + pmaddubsw m2, m9 + pmaddubsw m14, m9 + pmaddubsw m3, m9 + pmaddubsw m15, m9 + punpckhbw m24, m6, m18 + punpcklbw m6, m18 + paddw m20, m2 + paddw m21, m14 + paddw m22, m3 + paddw m23, m15 + mova m2, m4 + mova m14, m16 + mova m3, m5 + mova m15, m17 + pmaddubsw m4, m10 + pmaddubsw m16, m10 + pmaddubsw m5, m10 + pmaddubsw m17, m10 + punpckhbw m25, m18, m19 + punpcklbw m18, m19 + paddw m20, m4 + paddw m21, m16 + paddw m22, m5 + paddw m23, m17 + mova m4, m6 + mova m16, m24 + mova m5, m18 + mova m17, m25 + pmaddubsw m6, m11 + pmaddubsw m24, m11 + pmaddubsw m18, m11 + pmaddubsw m25, m11 + paddw m20, m6 + paddw m21, m24 + paddw m22, m18 + paddw m23, m25 + pmulhrsw m20, m7 + pmulhrsw m21, m7 + pmulhrsw m22, m7 + pmulhrsw m23, m7 + mova m6, m19 + mova [tmpq+wq*0+ 0], m20 + mova [tmpq+wq*0+64], m21 + mova [tmpq+wq*2+ 0], m22 + mova [tmpq+wq*2+64], m23 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jg .v_loop + add r5, 64 + add r7, 128 + movzx hd, r6b + mov srcq, r5 + mov tmpq, r7 + sub r6d, 1<<8 + jg .v_loop0 + RET +.hv: + %assign stack_offset stack_offset - stack_size_padded + %assign stack_size_padded 0 + WIN64_SPILL_XMM 16 + cmp wd, 4 + je .hv_w4 + shr mxd, 16 + sub srcq, 3 + vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep_avx512icl+0] + vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep_avx512icl+4] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + tzcnt wd, wd + vpbroadcastd m8, [pd_2] + movzx wd, word [r7+wq*2+table_offset(prep, _8tap_hv)] + vpbroadcastd m9, [pd_32] + add wq, r7 + vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl] + lea stride3q, [strideq*3] + sub srcq, stride3q + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + pshufd m12, m0, q0000 + pshufd m13, m0, q1111 + pshufd m14, m0, q2222 + pshufd m15, m0, q3333 + jmp wq +.hv_w4: + movzx mxd, mxb + dec srcq + vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+2] + movzx mxd, myb + shr myd, 16 + cmp hd, 4 + cmove myd, mxd + vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl] + lea stride3q, [strideq*3] + sub srcq, stride3q + mov r3d, 0x04 + kmovb k1, r3d + kshiftlb k2, k1, 2 + kshiftlb k3, k1, 4 + vpbroadcastd m10, [pd_2] + vbroadcasti128 m16, [subpel_h_shufA] + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + vpbroadcastd m11, [pd_32] + pshufd m12, m0, q0000 + pshufd m13, m0, q1111 + pshufd m14, m0, q2222 + pshufd m15, m0, q3333 + movq xm3, [srcq+strideq*0] + vpbroadcastq ym2, [srcq+strideq*1] + vpbroadcastq ym3{k1}, [srcq+strideq*2] + vpbroadcastq m2{k2}, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vpbroadcastq m3{k2}, [srcq+strideq*0] + vpbroadcastq m2{k3}, [srcq+strideq*1] + vpbroadcastq m3{k3}, [srcq+strideq*2] + mova m17, [spel_hv_perm4a] + movu m18, [spel_hv_perm4b] + mova m0, m10 + mova m1, m10 + pshufb m2, m16 + pshufb m3, m16 + vpdpbusd m0, m2, m8 + vpdpbusd m1, m3, m8 + packssdw m0, m1 ; _ 0 1 2 3 4 5 6 + psraw m0, 2 + vpermb m1, m17, m0 ; 01 12 23 34 + vpermb m2, m18, m0 ; 23 34 45 56 +.hv_w4_loop: + movq xm3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + movq xm4, [srcq+strideq*0] + vpbroadcastq ym3{k1}, [srcq+strideq*1] + vpbroadcastq ym4{k1}, [srcq+strideq*2] + mova ym5, ym10 + mova ym6, ym10 + pshufb ym3, ym16 + pshufb ym4, ym16 + vpdpbusd ym5, ym3, ym8 
+ vpdpbusd ym6, ym4, ym8 + mova m7, m11 + packssdw ym5, ym6 ; 7 8 9 a _ _ _ _ + psraw ym5, 2 + valignq m0, m5, m0, 4 ; _ 4 5 6 7 8 9 a + vpdpwssd m7, m1, m12 + vpdpwssd m7, m2, m13 + vpermb m1, m17, m0 ; 45 56 67 78 + vpermb m2, m18, m0 ; 67 78 89 9a + vpdpwssd m7, m1, m14 + vpdpwssd m7, m2, m15 + psrad m7, 6 + vpmovdw [tmpq], m7 + add tmpq, 32 + sub hd, 4 + jg .hv_w4_loop + vzeroupper + RET +.hv_w8: + WIN64_SPILL_XMM 24 + vbroadcasti128 m16, [subpel_h_shufA] + vbroadcasti128 m17, [subpel_h_shufB] + vbroadcasti128 m18, [subpel_h_shufC] + vinserti128 ym0, [srcq+strideq*0], 1 + vinserti128 m0, [srcq+strideq*1], 2 + vinserti128 m0, [srcq+strideq*2], 3 + movu xm1, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vinserti128 ym1, [srcq+strideq*0], 1 + vinserti128 m1, [srcq+strideq*1], 2 + vinserti128 m1, [srcq+strideq*2], 3 + mova m2, m8 + mova m4, m8 + mova m3, m8 + mova m5, m8 + pshufb m20, m0, m16 + pshufb m21, m0, m17 + pshufb m22, m0, m18 + pshufb m23, m1, m16 + pshufb m6, m1, m17 + pshufb m7, m1, m18 + vpdpbusd m2, m20, m10 + vpdpbusd m4, m21, m10 + vpdpbusd m2, m21, m11 + vpdpbusd m4, m22, m11 + vpdpbusd m3, m23, m10 + vpdpbusd m5, m6, m10 + vpdpbusd m3, m6, m11 + vpdpbusd m5, m7, m11 + packssdw m2, m4 + packssdw m3, m5 + psraw m2, 2 ; _ 0 1 2 + psraw m3, 2 ; 3 4 5 6 + valignq m0, m3, m2, 2 ; 0 1 2 3 + valignq m1, m3, m2, 4 ; 1 2 3 4 + valignq m2, m3, m2, 6 ; 2 3 4 5 + punpcklwd m4, m0, m1 ; 01a 12a 23a 34a + punpckhwd m5, m0, m1 ; 01b 12b 23b 34b + punpcklwd m6, m2, m3 ; 23a 34a 45a 56a + punpckhwd m7, m2, m3 ; 23b 34b 45b 56b +.hv_w8_loop: + movu xm19, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + vinserti128 ym19, [srcq+strideq*0], 1 + vinserti128 m19, [srcq+strideq*1], 2 + vinserti128 m19, [srcq+strideq*2], 3 + mova m20, m9 + mova m21, m9 + mova m22, m8 + mova m23, m8 + vpdpwssd m20, m4, m12 + vpdpwssd m21, m5, m12 + vpdpwssd m20, m6, m13 + vpdpwssd m21, m7, m13 + pshufb m0, m19, m16 + pshufb m1, m19, m17 + pshufb m2, m19, m18 + vpdpbusd m22, m0, m10 + vpdpbusd m23, m1, m10 + vpdpbusd m22, m1, m11 + vpdpbusd m23, m2, m11 + packssdw m22, m23 + psraw m22, 2 ; 7 8 9 A + valignq m0, m22, m3, 2 ; 4 5 6 7 + valignq m1, m22, m3, 4 ; 5 6 7 8 + valignq m2, m22, m3, 6 ; 6 7 8 9 + mova m3, m22 + punpcklwd m4, m0, m1 ; 45a 56a 67a 78a + punpckhwd m5, m0, m1 ; 45b 56b 67b 78b + punpcklwd m6, m2, m3 ; 67a 78a 89a 9Aa + punpckhwd m7, m2, m3 ; 67b 78b 89b 9Ab + vpdpwssd m20, m4, m14 + vpdpwssd m21, m5, m14 + vpdpwssd m20, m6, m15 + vpdpwssd m21, m7, m15 + psrad m20, 6 + psrad m21, 6 + packssdw m20, m21 + mova [tmpq], m20 + add tmpq, 64 + sub hd, 4 + jg .hv_w8_loop + RET +.hv_w16: + mov wd, 16*2 + jmp .hv_start +.hv_w32: + mov wd, 32*2 + jmp .hv_start +.hv_w64: + mov wd, 64*2 + jmp .hv_start +.hv_w128: + mov wd, 128*2 +.hv_start: + WIN64_SPILL_XMM 31 + mova m16, [spel_h_perm16a] + mova m17, [spel_h_perm16b] + mova m18, [spel_h_perm16c] + lea r6d, [hq+wq*8-256] + mov r5, srcq + mov r7, tmpq +.hv_loop0: + movu ym0, [srcq+strideq*0] + vinserti32x8 m0, [srcq+strideq*1], 1 + lea srcq, [srcq+strideq*2] + movu ym1, [srcq+strideq*0] + vinserti32x8 m1, [srcq+strideq*1], 1 + lea srcq, [srcq+strideq*2] + movu ym2, [srcq+strideq*0] + vinserti32x8 m2, [srcq+strideq*1], 1 + lea srcq, [srcq+strideq*2] + movu ym3, [srcq+strideq*0] + mova m4, m8 + mova m5, m8 + mova m6, m8 + mova m7, m8 + vpermb m19, m16, m0 + vpermb m20, m17, m0 + vpermb m21, m18, m0 + vpermb m22, m16, m1 + vpermb m23, m17, m1 + vpermb m24, m18, m1 + vpermb m25, m16, m2 + vpermb m26, m17, m2 + vpermb m27, m18, m2 + vpermb ym28, ym16, ym3 + vpermb 
ym29, ym17, ym3 + vpermb ym30, ym18, ym3 + mova m0, m8 + mova m1, m8 + mova ym2, ym8 + mova ym3, ym8 + vpdpbusd m4, m19, m10 + vpdpbusd m5, m20, m10 + vpdpbusd m6, m22, m10 + vpdpbusd m7, m23, m10 + vpdpbusd m0, m25, m10 + vpdpbusd m1, m26, m10 + vpdpbusd ym2, ym28, ym10 + vpdpbusd ym3, ym29, ym10 + vpdpbusd m4, m20, m11 + vpdpbusd m5, m21, m11 + vpdpbusd m6, m23, m11 + vpdpbusd m7, m24, m11 + vpdpbusd m0, m26, m11 + vpdpbusd m1, m27, m11 + vpdpbusd ym2, ym29, ym11 + vpdpbusd ym3, ym30, ym11 + packssdw m4, m5 + packssdw m6, m7 + packssdw m0, m1 + packssdw ym2, ym3 + psraw m4, 2 ; 0a 0b 1a 1b + psraw m6, 2 ; 2a 2b 3a 3b + psraw m0, 2 ; 4a 4b 5a 5b + psraw ym2, 2 ; 6a 6b __ __ + vshufi32x4 m5, m4, m6, q1032 ; 1a 1b 2a 2b + vshufi32x4 m7, m6, m0, q1032 ; 3a 3b 4a 4b + vshufi32x4 m1, m0, m2, q1032 ; 5a 5b 6a 6b + punpcklwd m2, m4, m5 ; 01a 01c 12a 12c + punpckhwd m3, m4, m5 ; 01b 01d 12b 12d + punpcklwd m4, m6, m7 ; 23a 23c 34a 34c + punpckhwd m5, m6, m7 ; 23b 23d 34b 34d + punpcklwd m6, m0, m1 ; 45a 45c 56a 56c + punpckhwd m7, m0, m1 ; 45b 45d 56b 56d +.hv_loop: + movu ym19, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + vinserti32x8 m19, [srcq+strideq*0], 1 + mova m20, m9 + mova m21, m9 + mova m22, m8 + mova m23, m8 + vpdpwssd m20, m2, m12 + vpdpwssd m21, m3, m12 + vpdpwssd m20, m4, m13 + vpdpwssd m21, m5, m13 + vpermb m24, m16, m19 + vpermb m25, m17, m19 + vpermb m26, m18, m19 + vpdpbusd m22, m24, m10 + vpdpbusd m23, m25, m10 + vpdpbusd m22, m25, m11 + vpdpbusd m23, m26, m11 + packssdw m22, m23 + psraw m22, 2 ; 7a 7b 8a 8b + vshufi32x4 m0, m1, m22, q1032 ; 6a 6b 7a 7b + mova m2, m4 + mova m3, m5 + mova m1, m22 + mova m4, m6 + mova m5, m7 + punpcklwd m6, m0, m1 ; 67a 67c 78a 78c + punpckhwd m7, m0, m1 ; 67b 67d 78b 78d + vpdpwssd m20, m4, m14 + vpdpwssd m21, m5, m14 + vpdpwssd m20, m6, m15 + vpdpwssd m21, m7, m15 + psrad m20, 6 + psrad m21, 6 + packssdw m20, m21 + mova [tmpq+wq*0], ym20 + vextracti32x8 [tmpq+wq*1], m20, 1 + lea tmpq, [tmpq+wq*2] + sub hd, 2 + jg .hv_loop + add r5, 16 + add r7, 32 + movzx hd, r6b + mov srcq, r5 + mov tmpq, r7 + sub r6d, 1<<8 + jg .hv_loop0 + RET + +cglobal warp_affine_8x8t_8bpc, 4, 7, 22, tmp, ts + vpbroadcastd m9, [pd_16384] + mova ym15, [warp_8x8t_end] + call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main + jmp .start +.loop: + call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main2 + lea tmpq, [tmpq+tsq*4] +.start: + paddd m16, m16 + vpermb m16, m15, m16 + mova [tmpq+tsq*0], xm16 + vextracti128 [tmpq+tsq*2], ym16, 1 + sub r6d, 0x1800 + jg .loop + RET + +cglobal warp_affine_8x8_8bpc, 4, 7, 22, dst, ds, src, ss, abcd, filter + vpbroadcastd m9, [pd_262144] + mova xm15, [warp_8x8_end] + call .main + jmp .start +.loop: + call .main2 + lea dstq, [dstq+dsq*2] +.start: + psrad m16, 19 + packuswb m16, m16 + vpermb m16, m15, m16 + movq [dstq+dsq*0], xm16 + movhps [dstq+dsq*1], xm16 + sub r6d, 0x1800 + jg .loop + RET +ALIGN function_align +.main: + vpbroadcastd m1, [pd_512] +%if WIN64 + mov abcdq, r5mp + vpaddd ym18, ym1, r6m {1to8} ; mx +%else + add r5d, 512 + vpbroadcastd ym18, r5d +%endif + vpaddd ym20, ym1, r7m {1to8} ; my + mova ym16, [pd_0to7] + vpbroadcastd ym19, [abcdq+4*0] + vpbroadcastd ym21, [abcdq+4*1] + lea r4, [ssq*3+3] + mova m10, [warp_8x8_permA] + mov r6d, 0x5555 + mova m11, [warp_8x8_permB] + lea filterq, [mc_warp_filter+64*8] + vpbroadcastq m12, [warp_8x8_hpack] + sub srcq, r4 ; src -= src_stride*3 + 3 + vbroadcasti32x4 m13, [warp_8x8_permC] + kxnorb k2, k2, k2 + vbroadcasti32x4 m14, [warp_8x8_permD] + vpdpwssd ym18, 
ym19, ym16 ; alpha + vpdpwssd ym20, ym21, ym16 ; gamma + vbroadcasti32x4 m0, [srcq] + psrad ym19, 16 ; beta + psrad ym21, 16 ; delta + kmovw k1, r6d + psrad ym16, ym18, 10 + kmovb k3, k2 + paddd ym18, ym19 + vpgatherdq m2{k2}, [filterq+ym16*8] ; filter_x0 + psrld m1, 8 ; pd_2 + pshufb m0, m11 + paddd m8, m1, m1 ; pd_4 + vpdpbusd m1, m0, m2 + call .h + psllq m2, m1, 45 + pslld m1, 13 + paddd m1, m2 + vpshrdq m1, m0, 48 ; 01 12 + call .h + vpshrdq m2, m1, m0, 48 ; 23 34 + call .h + vpshrdq m3, m2, m0, 48 ; 45 56 +.main2: + call .h + psrad ym17, ym20, 10 + kmovb k2, k3 + paddd ym20, ym21 + vpgatherdq m7{k3}, [filterq+ym17*8] ; filter_y0 + psrad ym16, ym20, 10 + kmovb k3, k2 + paddd ym20, ym21 + vpgatherdq m17{k2}, [filterq+ym16*8] ; filter_y1 + shufps m5, m7, m17, q2020 ; a0 a1 a2 a3 b0 b1 b2 b3 A0 A1 A2 A3 B0 B1 B2 B3 + mova m16, m9 + pshufb m4, m5, m13 ; a0 a1 A0 A1 b0 b1 B0 B1 + vpdpwssd m16, m1, m4 + pshufb m5, m14 ; a2 a3 A2 A3 b2 b3 B2 B3 + mova m1, m2 + vpdpwssd m16, m2, m5 + shufps m5, m7, m17, q3131 ; a4 a5 a6 a7 b4 b5 b6 b7 A4 A5 A6 A7 B4 B5 B6 B7 + mova m2, m3 + pshufb m4, m5, m13 ; a4 a5 A4 A5 b4 b5 B4 B5 + vpdpwssd m16, m3, m4 + vpshrdq m3, m0, 48 ; 67 78 + pshufb m5, m14 ; a6 a7 A6 A7 b6 b7 B6 B7 + vpdpwssd m16, m3, m5 + ret +ALIGN function_align +.h: + movu xm5, [srcq+ssq*1] + psrad ym16, ym18, 10 + lea srcq, [srcq+ssq*2] + vinserti32x4 ym5, [srcq+ssq*0], 1 + kmovb k2, k3 + paddd ym18, ym19 + vpgatherdq m6{k3}, [filterq+ym16*8] ; filter_x1 + psrad ym17, ym18, 10 + kmovb k3, k2 + paddd ym18, ym19 + vpgatherdq m16{k2}, [filterq+ym17*8] ; filter_x2 + mova m0, m8 + vpermb m4, m10, m5 ; a4 b0 a5 b1 a6 b2 a7 b3 a8 b4 a9 b5 aa b6 ab b7 + vpshldq m17, m16, m6, 32 ; a4 a5 a6 a7 b0 b1 b2 b3 + vpdpbusd m0, m4, m17 + vpermb m5, m11, m5 ; a0 b4 a1 b5 a2 b6 a3 b7 a4 b8 a5 b9 a6 ba a7 bb + vmovdqa32 m16{k1}, m6 ; a0 a1 a2 a3 b4 b5 b6 b7 + vpdpbusd m0, m5, m16 + vpmultishiftqb m0, m12, m0 ; 1 1 2 2 (>> 3) + ret + +%macro BIDIR_FN 1 ; op + lea stride3q, [strideq*3] + jmp wq +.w4: + cmp hd, 8 + jg .w4_h16 + WRAP_YMM %1 0 + vextracti32x4 xm1, ym0, 1 + movd [dstq ], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 + jl .w4_ret + lea dstq, [dstq+strideq*4] + pextrd [dstq ], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 +.w4_ret: + RET +.w4_h16: + vpbroadcastd m7, strided + pmulld m7, [bidir_sctr_w4] + %1 0 + kxnorw k1, k1, k1 + vpscatterdd [dstq+m7]{k1}, m0 + RET +.w8: + cmp hd, 4 + jne .w8_h8 + WRAP_YMM %1 0 + vextracti32x4 xm1, ym0, 1 + movq [dstq ], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm1 + RET +.w8_loop: + %1_INC_PTR 2 + lea dstq, [dstq+strideq*4] +.w8_h8: + %1 0 + vextracti32x4 xm1, ym0, 1 + vextracti32x4 xm2, m0, 2 + vextracti32x4 xm3, m0, 3 + movq [dstq ], xm0 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 + movq [dstq+stride3q ], xm3 + lea dstq, [dstq+strideq*4] + movhps [dstq ], xm0 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm3 + sub hd, 8 + jg .w8_loop + RET +.w16_loop: + %1_INC_PTR 2 + lea dstq, [dstq+strideq*4] +.w16: + %1 0 + vpermq m0, m0, q3120 + mova [dstq ], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + sub hd, 4 + jg .w16_loop + RET +.w32: + pmovzxbq m7, [pb_02461357] +.w32_loop: + %1 0 + %1_INC_PTR 2 + vpermq m0, m7, m0 + mova [dstq+strideq*0], ym0 + vextracti32x8 
[dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop + RET +.w64: + pmovzxbq m7, [pb_02461357] +.w64_loop: + %1 0 + %1_INC_PTR 2 + vpermq m0, m7, m0 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET +.w128: + pmovzxbq m7, [pb_02461357] +.w128_loop: + %1 0 + vpermq m6, m7, m0 + %1 2 + mova [dstq+64*0], m6 + %1_INC_PTR 4 + vpermq m6, m7, m0 + mova [dstq+64*1], m6 + add dstq, strideq + dec hd + jg .w128_loop + RET +%endmacro + +%macro AVG 1 ; src_offset + mova m0, [tmp1q+(%1+0)*mmsize] + paddw m0, [tmp2q+(%1+0)*mmsize] + mova m1, [tmp1q+(%1+1)*mmsize] + paddw m1, [tmp2q+(%1+1)*mmsize] + pmulhrsw m0, m4 + pmulhrsw m1, m4 + packuswb m0, m1 +%endmacro + +%macro AVG_INC_PTR 1 + add tmp1q, %1*mmsize + add tmp2q, %1*mmsize +%endmacro + +cglobal avg_8bpc, 4, 7, 5, dst, stride, tmp1, tmp2, w, h, stride3 +%define base r6-avg_avx512icl_table + lea r6, [avg_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, dword [r6+wq*4] + vpbroadcastd m4, [base+pw_1024] + add wq, r6 + BIDIR_FN AVG + +%macro W_AVG 1 ; src_offset + ; (a * weight + b * (16 - weight) + 128) >> 8 + ; = ((a - b) * weight + (b << 4) + 128) >> 8 + ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4 + ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4 + mova m0, [tmp1q+(%1+0)*mmsize] + psubw m2, m0, [tmp2q+(%1+0)*mmsize] + mova m1, [tmp1q+(%1+1)*mmsize] + psubw m3, m1, [tmp2q+(%1+1)*mmsize] + pmulhw m2, m4 + pmulhw m3, m4 + paddw m0, m2 + paddw m1, m3 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 +%endmacro + +%define W_AVG_INC_PTR AVG_INC_PTR + +cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 +%define base r6-w_avg_avx512icl_table + lea r6, [w_avg_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + vpbroadcastw m4, r6m ; weight + movsxd wq, dword [r6+wq*4] + vpbroadcastd m5, [base+pw_2048] + psllw m4, 12 ; (weight-16) << 12 when interpreted as signed + add wq, r6 + cmp dword r6m, 7 + jg .weight_gt7 + mov r6, tmp1q + pxor m0, m0 + mov tmp1q, tmp2q + psubw m4, m0, m4 ; -weight + mov tmp2q, r6 +.weight_gt7: + BIDIR_FN W_AVG + +%macro MASK 1 ; src_offset + ; (a * m + b * (64 - m) + 512) >> 10 + ; = ((a - b) * m + (b << 6) + 512) >> 10 + ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4 +%if mmsize == 64 + vpermq m3, m8, [maskq+%1*32] +%else + vpermq m3, [maskq+%1*16], q3120 +%endif + mova m0, [tmp2q+(%1+0)*mmsize] + psubw m1, m0, [tmp1q+(%1+0)*mmsize] + psubb m3, m4, m3 + paddw m1, m1 ; (b - a) << 1 + paddb m3, m3 + punpcklbw m2, m4, m3 ; -m << 9 + pmulhw m1, m2 + paddw m0, m1 + mova m1, [tmp2q+(%1+1)*mmsize] + psubw m2, m1, [tmp1q+(%1+1)*mmsize] + paddw m2, m2 + punpckhbw m3, m4, m3 + pmulhw m2, m3 + paddw m1, m2 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 +%endmacro + +%macro MASK_INC_PTR 1 + add maskq, %1*32 + add tmp2q, %1*64 + add tmp1q, %1*64 +%endmacro + +cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-mask_avx512icl_table + lea r7, [mask_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + mov maskq, maskmp + movsxd wq, dword [r7+wq*4] + pxor m4, m4 + mova m8, [base+bilin_v_perm64] + vpbroadcastd m5, [base+pw_2048] + add wq, r7 + BIDIR_FN MASK + +%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4 + mova m%1, [tmp1q+mmsize*%3] + mova m1, [tmp2q+mmsize*%3] + psubw m1, m%1 + pabsw m%2, m1 + psubusw m%2, m6, m%2 + psrlw m%2, 8 ; 64 - m + psllw m2, m%2, 10 + pmulhw m1, m2 + paddw m%1, m1 + mova m1, [tmp1q+mmsize*%4] + mova m2, [tmp2q+mmsize*%4] + psubw m2, m1 + pabsw m3, m2 + psubusw m3, 
m6, m3 + vpshldw m%2, m3, 8 + psllw m3, m%2, 10 +%if %5 + psubb m%2, m5, m%2 +%endif + pmulhw m2, m3 + paddw m1, m2 + pmulhrsw m%1, m7 + pmulhrsw m1, m7 + packuswb m%1, m1 +%endmacro + +cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_420_avx512icl_table + lea r7, [w_mask_420_avx512icl_table] + tzcnt wd, wm + mov r6d, r7m ; sign + movifnidn hd, hm + movsxd wq, [r7+wq*4] + vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 + vpbroadcastd m7, [base+pw_2048] + vpbroadcastd m9, [base+pb_m64] ; -1 << 6 + mova ym10, [base+wm_420_mask+32] + vpbroadcastd m8, [base+wm_sign+r6*8] ; (258 - sign) << 6 + add wq, r7 + mov maskq, maskmp + lea stride3q, [strideq*3] + jmp wq +.w4: + mova m5, [wm_420_perm4] + cmp hd, 8 + jg .w4_h16 + WRAP_YMM W_MASK 0, 4, 0, 1 + vinserti128 ym5, [wm_420_perm4+32], 1 + vpermb ym4, ym5, ym4 + vpdpbusd ym8, ym4, ym9 + vextracti32x4 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 + jl .w4_end + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 +.w4_end: + vpermb ym8, ym10, ym8 + movq [maskq], xm8 + RET +.w4_h16: + vpbroadcastd m11, strided + pmulld m11, [bidir_sctr_w4] + W_MASK 0, 4, 0, 1 + vpermb m4, m5, m4 + vpdpbusd m8, m4, m9 + kxnorw k1, k1, k1 + vpermb m8, m10, m8 + mova [maskq], xm8 + vpscatterdd [dstq+m11]{k1}, m0 + RET +.w8: + mova m5, [wm_420_perm8] + cmp hd, 4 + jne .w8_h8 + WRAP_YMM W_MASK 0, 4, 0, 1 + vinserti128 ym5, [wm_420_perm8+32], 1 + vpermb ym4, ym5, ym4 + vpdpbusd ym8, ym4, ym9 + vpermb m8, m10, m8 + mova [maskq], xm8 + vextracti32x4 xm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm1 + RET +.w8_loop: + add tmp1q, 128 + add tmp2q, 128 + add maskq, 16 + lea dstq, [dstq+strideq*4] +.w8_h8: + W_MASK 0, 4, 0, 1 + vpermb m4, m5, m4 + mova m1, m8 + vpdpbusd m1, m4, m9 + vpermb m1, m10, m1 + mova [maskq], xm1 + vextracti32x4 xm1, ym0, 1 + vextracti32x4 xm2, m0, 2 + vextracti32x4 xm3, m0, 3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 + movq [dstq+stride3q ], xm3 + lea dstq, [dstq+strideq*4] + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm3 + sub hd, 8 + jg .w8_loop + RET +.w16: + mova m5, [wm_420_perm16] +.w16_loop: + W_MASK 0, 4, 0, 1 + vpermb m4, m5, m4 + mova m1, m8 + vpdpbusd m1, m4, m9 + add tmp1q, 128 + add tmp2q, 128 + vpermb m1, m10, m1 + vpermq m0, m0, q3120 + mova [maskq], xm1 + add maskq, 16 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + lea dstq, [dstq+strideq*4] + sub hd, 4 + jg .w16_loop + RET +.w32: + pmovzxbq m5, [pb_02461357] +.w32_loop: + W_MASK 0, 4, 0, 1 + mova m1, m8 + vpdpbusd m1, m4, m9 + add tmp1q, 128 + add tmp2q, 128 + vpermb m1, m10, m1 + vpermq m0, m5, m0 + mova [maskq], xm1 + add maskq, 16 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop + RET +.w64: + pmovzxbq m12, [wm_420_perm64] ; 0, 2, 4, 6, 8, 10, 12, 14 + psrlq m13, m12, 4 ; 1, 3, 5, 7, 9, 11, 13, 15 +.w64_loop: + W_MASK 0, 4, 0, 2 + W_MASK 11, 5, 1, 3 + mova m2, m8 + vpdpbusd m2, m4, m9 + mova m3, m8 + vpdpbusd m3, m5, m9 + add tmp1q, 256 + add 
tmp2q, 256 + vpermt2b m2, m10, m3 + mova m1, m0 + vpermt2q m0, m12, m11 + vpermt2q m1, m13, m11 + mova [maskq], ym2 + add maskq, 32 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w64_loop + RET +.w128: + pmovzxbq m14, [wm_420_perm64] + mova m10, [wm_420_mask] + psrlq m15, m14, 4 +.w128_loop: + W_MASK 0, 12, 0, 4 + W_MASK 11, 13, 1, 5 + mova m4, m8 + vpdpbusd m4, m12, m9 + mova m5, m8 + vpdpbusd m5, m13, m9 + mova m1, m0 + vpermt2q m0, m14, m11 + vpermt2q m1, m15, m11 + mova [dstq+strideq*0+64*0], m0 + mova [dstq+strideq*1+64*0], m1 + W_MASK 0, 12, 2, 6 + W_MASK 11, 13, 3, 7 + vprold m4, 16 + vprold m5, 16 + vpdpbusd m4, m12, m9 + vpdpbusd m5, m13, m9 + add tmp1q, 512 + add tmp2q, 512 + vpermt2b m4, m10, m5 + mova m1, m0 + vpermt2q m0, m14, m11 + vpermt2q m1, m15, m11 + mova [maskq], m4 + add maskq, 64 + mova [dstq+strideq*0+64*1], m0 + mova [dstq+strideq*1+64*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w128_loop + RET + +cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_422_avx512icl_table + lea r7, [w_mask_422_avx512icl_table] + tzcnt wd, wm + mov r6d, r7m ; sign + movifnidn hd, hm + movsxd wq, dword [r7+wq*4] + vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 + vpbroadcastd m7, [base+pw_2048] + vpbroadcastd m9, [base+pw_m128] + mova m10, [base+wm_422_mask] + vpbroadcastd m11, [base+pb_127] + add wq, r7 + vpbroadcastd m8, [base+wm_sign+4+r6*4] + mov maskq, maskmp + lea stride3q, [strideq*3] + jmp wq +.w4: + cmp hd, 8 + jg .w4_h16 + WRAP_YMM W_MASK 0, 4, 0, 1 + movhps xm10, [wm_422_mask+16] + vpdpwssd ym8, ym4, ym9 + vpermb ym8, ym10, ym8 + vextracti32x4 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 + jl .w4_end + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 +.w4_end: + pand xm8, xm11 + mova [maskq], xm8 + RET +.w4_h16: + vpbroadcastd m5, strided + pmulld m5, [bidir_sctr_w4] + W_MASK 0, 4, 0, 1 + vpdpwssd m8, m4, m9 + kxnorw k1, k1, k1 + vpermb m8, m10, m8 + pand ym8, ym11 + mova [maskq], ym8 + vpscatterdd [dstq+m5]{k1}, m0 + RET +.w8: + cmp hd, 4 + jne .w8_h8 + WRAP_YMM W_MASK 0, 4, 0, 1 + movhps xm10, [wm_422_mask+16] + vpdpwssd ym8, ym4, ym9 + vpermb ym8, ym10, ym8 + pand xm8, xm11 + mova [maskq], xm8 + vextracti32x4 xm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm1 + RET +.w8_loop: + add tmp1q, 128 + add tmp2q, 128 + add maskq, 32 + lea dstq, [dstq+strideq*4] +.w8_h8: + W_MASK 0, 4, 0, 1 + mova m1, m8 + vpdpwssd m1, m4, m9 + vpermb m1, m10, m1 + pand ym1, ym11 + mova [maskq], ym1 + vextracti32x4 xm1, ym0, 1 + vextracti32x4 xm2, m0, 2 + vextracti32x4 xm3, m0, 3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 + movq [dstq+stride3q ], xm3 + lea dstq, [dstq+strideq*4] + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm3 + sub hd, 8 + jg .w8_loop + RET +.w16_loop: + add tmp1q, 128 + add tmp2q, 128 + add maskq, 32 + lea dstq, [dstq+strideq*4] +.w16: + W_MASK 0, 4, 0, 1 + mova m1, m8 + vpdpwssd m1, m4, m9 + vpermb m1, m10, m1 + vpermq m0, m0, q3120 + pand ym1, ym11 + mova [maskq], ym1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 
[dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + sub hd, 4 + jg .w16_loop + RET +.w32: + pmovzxbq m5, [pb_02461357] +.w32_loop: + W_MASK 0, 4, 0, 1 + mova m1, m8 + vpdpwssd m1, m4, m9 + add tmp1q, 128 + add tmp2q, 128 + vpermb m1, m10, m1 + vpermq m0, m5, m0 + pand ym1, ym11 + mova [maskq], ym1 + add maskq, 32 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop + RET +.w64: + pmovzxbq m5, [pb_02461357] +.w64_loop: + W_MASK 0, 4, 0, 1 + mova m1, m8 + vpdpwssd m1, m4, m9 + add tmp1q, 128 + add tmp2q, 128 + vpermb m1, m10, m1 + vpermq m0, m5, m0 + pand ym1, ym11 + mova [maskq], ym1 + add maskq, 32 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET +.w128: + pmovzxbq m13, [pb_02461357] +.w128_loop: + W_MASK 0, 4, 0, 1 + W_MASK 12, 5, 2, 3 + mova m2, m8 + vpdpwssd m2, m4, m9 + mova m3, m8 + vpdpwssd m3, m5, m9 + add tmp1q, 256 + add tmp2q, 256 + vpermt2b m2, m10, m3 + vpermq m0, m13, m0 + vpermq m1, m13, m12 + pand m2, m11 + mova [maskq], m2 + add maskq, 64 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + add dstq, strideq + dec hd + jg .w128_loop + RET + +cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3 +%define base r7-w_mask_444_avx512icl_table + lea r7, [w_mask_444_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, dword [r7+wq*4] + vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 + vpbroadcastd m5, [base+pb_64] + vpbroadcastd m7, [base+pw_2048] + mova m8, [base+wm_444_mask] + add wq, r7 + mov maskq, maskmp + lea stride3q, [strideq*3] + jmp wq +.w4: + cmp hd, 8 + jg .w4_h16 + WRAP_YMM W_MASK 0, 4, 0, 1, 1 + vinserti128 ym8, [wm_444_mask+32], 1 + vpermb ym4, ym8, ym4 + mova [maskq], ym4 + vextracti32x4 xm1, m0, 1 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 + jl .w4_end + lea dstq, [dstq+strideq*4] + pextrd [dstq+strideq*0], xm0, 2 + pextrd [dstq+strideq*1], xm0, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 +.w4_end: + RET +.w4_h16: + vpbroadcastd m9, strided + pmulld m9, [bidir_sctr_w4] + W_MASK 0, 4, 0, 1, 1 + vpermb m4, m8, m4 + kxnorw k1, k1, k1 + mova [maskq], m4 + vpscatterdd [dstq+m9]{k1}, m0 + RET +.w8: + cmp hd, 4 + jne .w8_h8 + WRAP_YMM W_MASK 0, 4, 0, 1, 1 + vinserti128 ym8, [wm_444_mask+32], 1 + vpermb ym4, ym8, ym4 + mova [maskq], ym4 + vextracti32x4 xm1, ym0, 1 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm1 + RET +.w8_loop: + add tmp1q, 128 + add tmp2q, 128 + add maskq, 64 + lea dstq, [dstq+strideq*4] +.w8_h8: + W_MASK 0, 4, 0, 1, 1 + vpermb m4, m8, m4 + mova [maskq], m4 + vextracti32x4 xm1, ym0, 1 + vextracti32x4 xm2, m0, 2 + vextracti32x4 xm3, m0, 3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 + movq [dstq+stride3q ], xm3 + lea dstq, [dstq+strideq*4] + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm3 + sub hd, 8 + jg .w8_loop + RET +.w16_loop: + add tmp1q, 128 + add tmp2q, 128 + add maskq, 64 + lea dstq, [dstq+strideq*4] +.w16: + W_MASK 0, 4, 0, 1, 1 + vpermb m4, m8, m4 + vpermq m0, m0, q3120 + mova [maskq], m4 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], m0, 2 + vextracti32x4 [dstq+strideq*2], ym0, 1 + vextracti32x4 [dstq+stride3q ], m0, 3 + sub hd, 4 + jg .w16_loop + RET +.w32: + pmovzxbq m9, [pb_02461357] 
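+; Unlike the 420/422 variants above, 4:4:4 keeps the blend mask at full
+; resolution: W_MASK's 64-m result is flipped back to m via pb_64 (the
+; %5=1 argument) and, after a vpermb into pixel order, stored as-is, so
+; no vpdpbusd/vpdpwssd subsampling accumulation is required.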
+.w32_loop: + W_MASK 0, 4, 0, 1, 1 + vpermb m4, m8, m4 + add tmp1q, 128 + add tmp2q, 128 + vpermq m0, m9, m0 + mova [maskq], m4 + add maskq, 64 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w32_loop + RET +.w64: + pmovzxbq m9, [pb_02461357] +.w64_loop: + W_MASK 0, 4, 0, 1, 1 + vpermb m4, m8, m4 + add tmp1q, 128 + add tmp2q, 128 + vpermq m0, m9, m0 + mova [maskq], m4 + add maskq, 64 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w64_loop + RET +.w128: + pmovzxbq m11, [pb_02461357] +.w128_loop: + W_MASK 0, 4, 0, 1, 1 + W_MASK 10, 9, 2, 3, 1 + vpermb m4, m8, m4 + vpermb m9, m8, m9 + add tmp1q, 256 + add tmp2q, 256 + vpermq m0, m11, m0 + vpermq m10, m11, m10 + mova [maskq+64*0], m4 + mova [maskq+64*1], m9 + add maskq, 128 + mova [dstq+64*0], m0 + mova [dstq+64*1], m10 + add dstq, strideq + dec hd + jg .w128_loop + RET + +cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask +%define base r6-blend_avx512icl_table + lea r6, [blend_avx512icl_table] + tzcnt wd, wm + movifnidn maskq, maskmp + movifnidn hd, hm + movsxd wq, [r6+wq*4] + vpbroadcastd m6, [base+pb_64] + vpbroadcastd m7, [base+pw_512] + sub tmpq, maskq + add wq, r6 + lea r6, [dsq*3] + jmp wq +.w4: + movd xmm0, [dstq+dsq*0] + pinsrd xmm0, [dstq+dsq*1], 1 + vpbroadcastd xmm1, [dstq+dsq*2] + pinsrd xmm1, [dstq+r6 ], 3 + mova xmm4, [maskq] + mova xmm5, [maskq+tmpq] + add maskq, 4*4 + psubb xmm3, xm6, xmm4 + punpcklbw xmm0, xmm5 + punpcklbw xmm2, xmm3, xmm4 + punpckhbw xmm1, xmm5 + punpckhbw xmm3, xmm4 + pmaddubsw xmm0, xmm2 + pmaddubsw xmm1, xmm3 + pmulhrsw xmm0, xm7 + pmulhrsw xmm1, xm7 + packuswb xmm0, xmm1 + movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 1 + pextrd [dstq+dsq*2], xmm0, 2 + pextrd [dstq+r6 ], xmm0, 3 + lea dstq, [dstq+dsq*4] + sub hd, 4 + jg .w4 + RET +.w8: + movq xmm0, [dstq+dsq*0] + vpbroadcastq xmm1, [dstq+dsq*1] + vpbroadcastq ymm2, [dstq+dsq*2] + vpbroadcastq ymm3, [dstq+r6 ] + mova ymm4, [maskq] + mova ymm5, [maskq+tmpq] + add maskq, 8*4 + vpblendd ymm0, ymm2, 0x30 + vpblendd ymm1, ymm3, 0xc0 + psubb ymm3, ym6, ymm4 + punpcklbw ymm0, ymm5 + punpcklbw ymm2, ymm3, ymm4 + punpckhbw ymm1, ymm5 + punpckhbw ymm3, ymm4 + pmaddubsw ymm0, ymm2 + pmaddubsw ymm1, ymm3 + pmulhrsw ymm0, ym7 + pmulhrsw ymm1, ym7 + packuswb ymm0, ymm1 + vextracti128 xmm1, ymm0, 1 + movq [dstq+dsq*0], xmm0 + movhps [dstq+dsq*1], xmm0 + movq [dstq+dsq*2], xmm1 + movhps [dstq+r6 ], xmm1 + lea dstq, [dstq+dsq*4] + sub hd, 4 + jg .w8 + vzeroupper + RET +.w16: + mova xm1, [dstq+dsq*0] + vinserti32x4 ym1, [dstq+dsq*1], 1 + vinserti32x4 m1, [dstq+dsq*2], 2 + mova m4, [maskq] + vinserti32x4 m1, [dstq+r6 ], 3 + mova m5, [maskq+tmpq] + add maskq, 16*4 + psubb m3, m6, m4 + punpcklbw m0, m1, m5 + punpcklbw m2, m3, m4 + punpckhbw m1, m5 + punpckhbw m3, m4 + pmaddubsw m0, m2 + pmaddubsw m1, m3 + pmulhrsw m0, m7 + pmulhrsw m1, m7 + packuswb m0, m1 + mova [dstq+dsq*0], xm0 + vextracti32x4 [dstq+dsq*1], ym0, 1 + vextracti32x4 [dstq+dsq*2], m0, 2 + vextracti32x4 [dstq+r6 ], m0, 3 + lea dstq, [dstq+dsq*4] + sub hd, 4 + jg .w16 + RET +.w32: + mova ym1, [dstq+dsq*0] + vinserti32x8 m1, [dstq+dsq*1], 1 + mova m4, [maskq] + mova m5, [maskq+tmpq] + add maskq, 32*2 + psubb m3, m6, m4 + punpcklbw m0, m1, m5 + punpcklbw m2, m3, m4 + punpckhbw m1, m5 + punpckhbw m3, m4 + pmaddubsw m0, m2 + pmaddubsw m1, m3 + pmulhrsw m0, m7 + pmulhrsw m1, m7 + packuswb m0, m1 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w32 + RET + +cglobal blend_v_8bpc, 3, 
6, 6, dst, ds, tmp, w, h, mask +%define base r5-blend_v_avx512icl_table + lea r5, [blend_v_avx512icl_table] + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, [r5+wq*4] + vpbroadcastd m5, [base+pw_512] + add wq, r5 + add maskq, obmc_masks-blend_v_avx512icl_table + jmp wq +.w2: + vpbroadcastd xmm2, [maskq+2*2] +.w2_s0_loop: + movd xmm0, [dstq+dsq*0] + pinsrw xmm0, [dstq+dsq*1], 1 + movd xmm1, [tmpq] + add tmpq, 2*2 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm2 + pmulhrsw xmm0, xm5 + packuswb xmm0, xmm0 + pextrw [dstq+dsq*0], xmm0, 0 + pextrw [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w2_s0_loop + RET +.w4: + vpbroadcastq xmm2, [maskq+4*2] +.w4_loop: + movd xmm0, [dstq+dsq*0] + pinsrd xmm0, [dstq+dsq*1], 1 + movq xmm1, [tmpq] + add tmpq, 4*2 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm2 + pmulhrsw xmm0, xm5 + packuswb xmm0, xmm0 + movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w4_loop + RET +.w8: + mova xmm3, [maskq+8*2] +.w8_loop: + movq xmm0, [dstq+dsq*0] + vpbroadcastq xmm1, [dstq+dsq*1] + mova xmm2, [tmpq] + add tmpq, 8*2 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + pmaddubsw xmm0, xmm3 + pmaddubsw xmm1, xmm3 + pmulhrsw xmm0, xm5 + pmulhrsw xmm1, xm5 + packuswb xmm0, xmm1 + movq [dstq+dsq*0], xmm0 + movhps [dstq+dsq*1], xmm0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w8_loop + RET +.w16: + vbroadcasti32x4 ym3, [maskq+16*2] + vbroadcasti32x4 ym4, [maskq+16*3] +.w16_loop: + mova xm1, [dstq+dsq*0] + vinserti32x4 ym1, [dstq+dsq*1], 1 + mova ym2, [tmpq] + add tmpq, 16*2 + punpcklbw ym0, ym1, ym2 + punpckhbw ym1, ym2 + pmaddubsw ym0, ym3 + pmaddubsw ym1, ym4 + pmulhrsw ym0, ym5 + pmulhrsw ym1, ym5 + packuswb ym0, ym1 + mova [dstq+dsq*0], xm0 + vextracti32x4 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w16_loop + RET +.w32: + mova m4, [maskq+32*2] + vshufi32x4 m3, m4, m4, q2020 + vshufi32x4 m4, m4, q3131 +.w32_loop: + mova ym1, [dstq+dsq*0] + vinserti32x8 m1, [dstq+dsq*1], 1 + mova m2, [tmpq] + add tmpq, 32*2 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m4 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w32_loop + RET + +cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask +%define base r6-blend_h_avx512icl_table + lea r6, [blend_h_avx512icl_table] + tzcnt wd, wm + mov hd, hm + movsxd wq, [r6+wq*4] + lea maskq, [base+obmc_masks+hq*2] + vpbroadcastd m5, [base+pw_512] + lea hd, [hq*3] + add wq, r6 + shr hd, 2 ; h * 3/4 + lea maskq, [maskq+hq*2] + neg hq + jmp wq +.w2: + movd xmm0, [dstq+dsq*0] + pinsrw xmm0, [dstq+dsq*1], 1 + movd xmm2, [maskq+hq*2] + movd xmm1, [tmpq] + add tmpq, 2*2 + punpcklwd xmm2, xmm2 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm2 + pmulhrsw xmm0, xm5 + packuswb xmm0, xmm0 + pextrw [dstq+dsq*0], xmm0, 0 + pextrw [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w2 + RET +.w4: + mova xmm3, [blend_shuf] +.w4_loop: + movd xmm0, [dstq+dsq*0] + pinsrd xmm0, [dstq+dsq*1], 1 + movd xmm2, [maskq+hq*2] + movq xmm1, [tmpq] + add tmpq, 4*2 + pshufb xmm2, xmm3 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm2 + pmulhrsw xmm0, xm5 + packuswb xmm0, xmm0 + movd [dstq+dsq*0], xmm0 + pextrd [dstq+dsq*1], xmm0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w4_loop + RET +.w8: + vbroadcasti128 ymm4, [blend_shuf] + shufpd ymm4, ymm4, 0x03 +.w8_loop: + vpbroadcastq ymm1, [dstq+dsq*0] + movq xmm0, [dstq+dsq*1] + vpblendd ymm0, ymm1, 0x30 + vpbroadcastd 
ymm3, [maskq+hq*2] + movq xmm1, [tmpq+8*1] + vinserti128 ymm1, [tmpq+8*0], 1 + add tmpq, 8*2 + pshufb ymm3, ymm4 + punpcklbw ymm0, ymm1 + pmaddubsw ymm0, ymm3 + pmulhrsw ymm0, ym5 + vextracti128 xmm1, ymm0, 1 + packuswb xmm0, xmm1 + movhps [dstq+dsq*0], xmm0 + movq [dstq+dsq*1], xmm0 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w8_loop + vzeroupper + RET +.w16: + vbroadcasti32x4 ym4, [blend_shuf] + shufpd ym4, ym4, 0x0c +.w16_loop: + mova xm1, [dstq+dsq*0] + vinserti32x4 ym1, [dstq+dsq*1], 1 + vpbroadcastd ym3, [maskq+hq*2] + mova ym2, [tmpq] + add tmpq, 16*2 + pshufb ym3, ym4 + punpcklbw ym0, ym1, ym2 + punpckhbw ym1, ym2 + pmaddubsw ym0, ym3 + pmaddubsw ym1, ym3 + pmulhrsw ym0, ym5 + pmulhrsw ym1, ym5 + packuswb ym0, ym1 + mova [dstq+dsq*0], xm0 + vextracti32x4 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w16_loop + RET +.w32: + vbroadcasti32x4 m4, [blend_shuf] + shufpd m4, m4, 0xf0 +.w32_loop: + mova ym1, [dstq+dsq*0] + vinserti32x8 m1, [dstq+dsq*1], 1 + vpbroadcastd m3, [maskq+hq*2] + mova m2, [tmpq] + add tmpq, 32*2 + pshufb m3, m4 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 + mova [dstq+dsq*0], ym0 + vextracti32x8 [dstq+dsq*1], m0, 1 + lea dstq, [dstq+dsq*2] + add hq, 2 + jl .w32_loop + RET +.w64: + vpbroadcastw m3, [maskq+hq*2] + mova m1, [dstq] + mova m2, [tmpq] + add tmpq, 32*2 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 + mova [dstq], m0 + add dstq, dsq + inc hq + jl .w64 + RET +.w128: + vpbroadcastw m6, [maskq+hq*2] + mova m2, [dstq+64*0] + mova m1, [tmpq+64*0] + mova m3, [dstq+64*1] + mova m4, [tmpq+64*1] + add tmpq, 64*2 + punpcklbw m0, m2, m1 + punpckhbw m2, m1 + pmaddubsw m0, m6 + pmaddubsw m2, m6 + punpcklbw m1, m3, m4 + punpckhbw m3, m4 + pmaddubsw m1, m6 + pmaddubsw m3, m6 + REPX {pmulhrsw x, m5}, m0, m2, m1, m3 + packuswb m0, m2 + packuswb m1, m3 + mova [dstq+64*0], m0 + mova [dstq+64*1], m1 + add dstq, dsq + inc hq + jl .w128 + RET + +cglobal resize_8bpc, 6, 12, 19, dst, dst_stride, src, src_stride, \ + dst_w, h, src_w, dx, mx0 + sub dword mx0m, 4<<14 + sub dword src_wm, 8 + mov r6, ~0 + vpbroadcastd m5, dxm + vpbroadcastd m8, mx0m + vpbroadcastd m6, src_wm + kmovq k3, r6 + DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x + LEA r7, $$ +%define base r7-$$ + vpbroadcastd m3, [base+pw_m256] + vpbroadcastd m7, [base+pd_63] + vbroadcasti32x4 m15, [base+pb_8x0_8x8] + vpdpwssd m8, m5, [base+rescale_mul] ; mx+dx*[0-15] + pslld m5, 4 ; dx*16 + pslld m6, 14 + pxor m2, m2 + mova m16, [base+resize_permA] + mova m17, [base+resize_permB] + mova xm18, [base+resize_permC] +.loop_y: + xor xd, xd + mova m4, m8 ; per-line working version of mx +.loop_x: + pmaxsd m0, m4, m2 + psrad m9, m4, 8 ; filter offset (unmasked) + pminsd m0, m6 ; iclip(mx, 0, src_w-8) + psubd m1, m4, m0 ; pshufb offset + psrad m0, 14 ; clipped src_x offset + psrad m1, 14 ; pshufb edge_emu offset + vptestmd k4, m1, m1 + pand m9, m7 ; filter offset (masked) + ktestw k4, k4 + jz .load + vextracti32x8 ym12, m0, 1 + vextracti32x8 ym13, m1, 1 + kmovq k1, k3 + kmovq k2, k3 + vpgatherdq m10{k1}, [srcq+ym0] + vpgatherdq m11{k2}, [srcq+ym12] + kmovq k1, k3 + kmovq k2, k3 + vpgatherdq m14{k1}, [base+resize_shuf+4+ym1] + vpgatherdq m0{k2}, [base+resize_shuf+4+ym13] + mova m12, m16 + mova m13, m17 + paddb m14, m15 + paddb m0, m15 + pshufb m10, m14 + pshufb m11, m0 + vpermi2d m12, m10, m11 + vpermi2d m13, m10, m11 + jmp .filter 
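+; .load is the fast path: none of the 16 output pixels in this batch
+; required edge clamping, so the 8 filter taps' worth of source bytes
+; per pixel can be gathered directly from srcq. The branch above instead
+; gathers from clamped offsets and repairs the out-of-bounds bytes with
+; resize_shuf-based pshufbs before jumping to .filter.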
+.load: + kmovq k1, k3 + kmovq k2, k3 + vpgatherdd m12{k1}, [srcq+m0+0] + vpgatherdd m13{k2}, [srcq+m0+4] +.filter: + kmovq k1, k3 + kmovq k2, k3 + vpgatherdd m10{k1}, [base+resize_filter+m9*8+0] + vpgatherdd m11{k2}, [base+resize_filter+m9*8+4] + mova m14, m2 + vpdpbusd m14, m12, m10 + vpdpbusd m14, m13, m11 + packssdw m14, m14 + pmulhrsw m14, m3 + packuswb m14, m14 + vpermd m14, m18, m14 + mova [dstq+xq], xm14 + paddd m4, m5 + add xd, 16 + cmp xd, dst_wd + jl .loop_x + add dstq, dst_strideq + add srcq, src_strideq + dec hd + jg .loop_y + RET + +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/mc_sse.asm b/third_party/dav1d/src/x86/mc_sse.asm new file mode 100644 index 0000000000..54939c647a --- /dev/null +++ b/third_party/dav1d/src/x86/mc_sse.asm @@ -0,0 +1,9599 @@ +; Copyright © 2018, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; Copyright © 2018, VideoLabs +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
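+
+; Rounding note: pmulhrsw computes (x*y + 0x4000) >> 15, so multiplying
+; by pw_512, pw_1024 or pw_2048 performs a round-to-nearest right shift
+; by 6, 5 or 4 bits respectively on the pmaddubsw intermediates used
+; throughout this file.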
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA 16 + +; dav1d_obmc_masks[] with 64-x interleaved +obmc_masks: db 0, 0, 0, 0 + ; 2 @4 + db 45, 19, 64, 0 + ; 4 @8 + db 39, 25, 50, 14, 59, 5, 64, 0 + ; 8 @16 + db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0 + ; 16 @32 + db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10 + db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0 + ; 32 @64 + db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20 + db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9 + db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2 + +warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8 +warp_8x8_shufB: db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12 +warp_8x8_shufC: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10 +warp_8x8_shufD: db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14 +blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 +subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 + db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14 +subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 +subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 +subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11 +subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 +bilin_h_shuf8: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +unpckw: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 +rescale_mul: dd 0, 1, 2, 3 +resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7 + +wm_420_sign: times 4 dw 258 + times 4 dw 257 +wm_422_sign: times 8 db 128 + times 8 db 127 + +pb_8x0_8x8: times 8 db 0 + times 8 db 8 +bdct_lb_dw: times 4 db 0 + times 4 db 4 + times 4 db 8 + times 4 db 12 + +pb_64: times 16 db 64 +pw_m256: times 8 dw -256 +pw_1: times 8 dw 1 +pw_2: times 8 dw 2 +pw_8: times 8 dw 8 +pw_15: times 8 dw 15 +pw_26: times 8 dw 26 +pw_34: times 8 dw 34 +pw_512: times 8 dw 512 +pw_1024: times 8 dw 1024 +pw_2048: times 8 dw 2048 +pw_6903: times 8 dw 6903 +pw_8192: times 8 dw 8192 +pd_32: times 4 dd 32 +pd_63: times 4 dd 63 +pd_512: times 4 dd 512 +pd_16384: times 4 dd 16484 +pd_32768: times 4 dd 32768 +pd_262144:times 4 dd 262144 +pd_0x3ff: times 4 dd 0x3ff +pd_0x4000:times 4 dd 0x4000 +pq_0x40000000: times 2 dq 0x40000000 + +const mc_warp_filter2 ; dav1d_mc_warp_filter[] reordered for pmaddubsw usage + ; [-1, 0) + db 0, 127, 0, 0, 0, 1, 0, 0, 0, 127, 0, 0, -1, 2, 0, 0 + db 1, 127, -1, 0, -3, 4, 0, 0, 1, 126, -2, 0, -4, 6, 1, 0 + db 1, 126, -3, 0, -5, 8, 1, 0, 1, 125, -4, 0, -6, 11, 1, 0 + db 1, 124, -4, 0, -7, 13, 1, 0, 2, 123, -5, 0, -8, 15, 1, 0 + db 2, 122, -6, 0, -9, 18, 1, 0, 2, 121, -6, 0, -10, 20, 1, 0 + db 2, 120, -7, 0, -11, 22, 2, 0, 2, 119, -8, 0, -12, 25, 2, 0 + db 3, 117, -8, 0, -13, 27, 2, 0, 3, 116, -9, 0, -13, 29, 2, 0 + db 3, 114, -10, 0, -14, 32, 3, 0, 3, 113, -10, 0, -15, 35, 2, 0 + db 3, 111, -11, 0, -15, 37, 3, 0, 3, 109, -11, 0, -16, 40, 3, 0 + db 3, 108, -12, 0, -16, 42, 3, 0, 4, 106, -13, 0, -17, 45, 3, 0 + db 4, 104, -13, 0, -17, 47, 3, 0, 4, 102, -14, 0, -17, 50, 3, 0 + db 4, 100, -14, 0, -17, 52, 3, 0, 4, 98, -15, 0, -18, 55, 4, 0 + db 4, 96, -15, 0, -18, 58, 3, 0, 4, 94, -16, 0, -18, 60, 4, 0 + db 4, 91, -16, 0, -18, 63, 4, 0, 4, 89, -16, 0, -18, 65, 4, 0 + db 4, 87, -17, 0, -18, 68, 4, 0, 4, 
85, -17, 0, -18, 70, 4, 0 + db 4, 82, -17, 0, -18, 73, 4, 0, 4, 80, -17, 0, -18, 75, 4, 0 + db 4, 78, -18, 0, -18, 78, 4, 0, 4, 75, -18, 0, -17, 80, 4, 0 + db 4, 73, -18, 0, -17, 82, 4, 0, 4, 70, -18, 0, -17, 85, 4, 0 + db 4, 68, -18, 0, -17, 87, 4, 0, 4, 65, -18, 0, -16, 89, 4, 0 + db 4, 63, -18, 0, -16, 91, 4, 0, 4, 60, -18, 0, -16, 94, 4, 0 + db 3, 58, -18, 0, -15, 96, 4, 0, 4, 55, -18, 0, -15, 98, 4, 0 + db 3, 52, -17, 0, -14, 100, 4, 0, 3, 50, -17, 0, -14, 102, 4, 0 + db 3, 47, -17, 0, -13, 104, 4, 0, 3, 45, -17, 0, -13, 106, 4, 0 + db 3, 42, -16, 0, -12, 108, 3, 0, 3, 40, -16, 0, -11, 109, 3, 0 + db 3, 37, -15, 0, -11, 111, 3, 0, 2, 35, -15, 0, -10, 113, 3, 0 + db 3, 32, -14, 0, -10, 114, 3, 0, 2, 29, -13, 0, -9, 116, 3, 0 + db 2, 27, -13, 0, -8, 117, 3, 0, 2, 25, -12, 0, -8, 119, 2, 0 + db 2, 22, -11, 0, -7, 120, 2, 0, 1, 20, -10, 0, -6, 121, 2, 0 + db 1, 18, -9, 0, -6, 122, 2, 0, 1, 15, -8, 0, -5, 123, 2, 0 + db 1, 13, -7, 0, -4, 124, 1, 0, 1, 11, -6, 0, -4, 125, 1, 0 + db 1, 8, -5, 0, -3, 126, 1, 0, 1, 6, -4, 0, -2, 126, 1, 0 + db 0, 4, -3, 0, -1, 127, 1, 0, 0, 2, -1, 0, 0, 127, 0, 0 + ; [0, 1) + db 0, 0, 1, 0, 0, 127, 0, 0, 0, -1, 2, 0, 0, 127, 0, 0 + db 0, -3, 4, 1, 1, 127, -2, 0, 0, -5, 6, 1, 1, 127, -2, 0 + db 0, -6, 8, 1, 2, 126, -3, 0, -1, -7, 11, 2, 2, 126, -4, -1 + db -1, -8, 13, 2, 3, 125, -5, -1, -1, -10, 16, 3, 3, 124, -6, -1 + db -1, -11, 18, 3, 4, 123, -7, -1, -1, -12, 20, 3, 4, 122, -7, -1 + db -1, -13, 23, 3, 4, 121, -8, -1, -2, -14, 25, 4, 5, 120, -9, -1 + db -1, -15, 27, 4, 5, 119, -10, -1, -1, -16, 30, 4, 5, 118, -11, -1 + db -2, -17, 33, 5, 6, 116, -12, -1, -2, -17, 35, 5, 6, 114, -12, -1 + db -2, -18, 38, 5, 6, 113, -13, -1, -2, -19, 41, 6, 7, 111, -14, -2 + db -2, -19, 43, 6, 7, 110, -15, -2, -2, -20, 46, 6, 7, 108, -15, -2 + db -2, -20, 49, 6, 7, 106, -16, -2, -2, -21, 51, 7, 7, 104, -16, -2 + db -2, -21, 54, 7, 7, 102, -17, -2, -2, -21, 56, 7, 8, 100, -18, -2 + db -2, -22, 59, 7, 8, 98, -18, -2, -2, -22, 62, 7, 8, 96, -19, -2 + db -2, -22, 64, 7, 8, 94, -19, -2, -2, -22, 67, 8, 8, 91, -20, -2 + db -2, -22, 69, 8, 8, 89, -20, -2, -2, -22, 72, 8, 8, 87, -21, -2 + db -2, -21, 74, 8, 8, 84, -21, -2, -2, -22, 77, 8, 8, 82, -21, -2 + db -2, -21, 79, 8, 8, 79, -21, -2, -2, -21, 82, 8, 8, 77, -22, -2 + db -2, -21, 84, 8, 8, 74, -21, -2, -2, -21, 87, 8, 8, 72, -22, -2 + db -2, -20, 89, 8, 8, 69, -22, -2, -2, -20, 91, 8, 8, 67, -22, -2 + db -2, -19, 94, 8, 7, 64, -22, -2, -2, -19, 96, 8, 7, 62, -22, -2 + db -2, -18, 98, 8, 7, 59, -22, -2, -2, -18, 100, 8, 7, 56, -21, -2 + db -2, -17, 102, 7, 7, 54, -21, -2, -2, -16, 104, 7, 7, 51, -21, -2 + db -2, -16, 106, 7, 6, 49, -20, -2, -2, -15, 108, 7, 6, 46, -20, -2 + db -2, -15, 110, 7, 6, 43, -19, -2, -2, -14, 111, 7, 6, 41, -19, -2 + db -1, -13, 113, 6, 5, 38, -18, -2, -1, -12, 114, 6, 5, 35, -17, -2 + db -1, -12, 116, 6, 5, 33, -17, -2, -1, -11, 118, 5, 4, 30, -16, -1 + db -1, -10, 119, 5, 4, 27, -15, -1, -1, -9, 120, 5, 4, 25, -14, -2 + db -1, -8, 121, 4, 3, 23, -13, -1, -1, -7, 122, 4, 3, 20, -12, -1 + db -1, -7, 123, 4, 3, 18, -11, -1, -1, -6, 124, 3, 3, 16, -10, -1 + db -1, -5, 125, 3, 2, 13, -8, -1, -1, -4, 126, 2, 2, 11, -7, -1 + db 0, -3, 126, 2, 1, 8, -6, 0, 0, -2, 127, 1, 1, 6, -5, 0 + db 0, -2, 127, 1, 1, 4, -3, 0, 0, 0, 127, 0, 0, 2, -1, 0 + ; [1, 2) + db 0, 0, 127, 0, 0, 1, 0, 0, 0, 0, 127, 0, 0, -1, 2, 0 + db 0, 1, 127, -1, 0, -3, 4, 0, 0, 1, 126, -2, 0, -4, 6, 1 + db 0, 1, 126, -3, 0, -5, 8, 1, 0, 1, 125, -4, 0, -6, 11, 1 + db 0, 1, 124, -4, 0, -7, 13, 1, 0, 2, 123, -5, 0, -8, 15, 1 + db 0, 2, 122, -6, 0, -9, 
18, 1, 0, 2, 121, -6, 0, -10, 20, 1 + db 0, 2, 120, -7, 0, -11, 22, 2, 0, 2, 119, -8, 0, -12, 25, 2 + db 0, 3, 117, -8, 0, -13, 27, 2, 0, 3, 116, -9, 0, -13, 29, 2 + db 0, 3, 114, -10, 0, -14, 32, 3, 0, 3, 113, -10, 0, -15, 35, 2 + db 0, 3, 111, -11, 0, -15, 37, 3, 0, 3, 109, -11, 0, -16, 40, 3 + db 0, 3, 108, -12, 0, -16, 42, 3, 0, 4, 106, -13, 0, -17, 45, 3 + db 0, 4, 104, -13, 0, -17, 47, 3, 0, 4, 102, -14, 0, -17, 50, 3 + db 0, 4, 100, -14, 0, -17, 52, 3, 0, 4, 98, -15, 0, -18, 55, 4 + db 0, 4, 96, -15, 0, -18, 58, 3, 0, 4, 94, -16, 0, -18, 60, 4 + db 0, 4, 91, -16, 0, -18, 63, 4, 0, 4, 89, -16, 0, -18, 65, 4 + db 0, 4, 87, -17, 0, -18, 68, 4, 0, 4, 85, -17, 0, -18, 70, 4 + db 0, 4, 82, -17, 0, -18, 73, 4, 0, 4, 80, -17, 0, -18, 75, 4 + db 0, 4, 78, -18, 0, -18, 78, 4, 0, 4, 75, -18, 0, -17, 80, 4 + db 0, 4, 73, -18, 0, -17, 82, 4, 0, 4, 70, -18, 0, -17, 85, 4 + db 0, 4, 68, -18, 0, -17, 87, 4, 0, 4, 65, -18, 0, -16, 89, 4 + db 0, 4, 63, -18, 0, -16, 91, 4, 0, 4, 60, -18, 0, -16, 94, 4 + db 0, 3, 58, -18, 0, -15, 96, 4, 0, 4, 55, -18, 0, -15, 98, 4 + db 0, 3, 52, -17, 0, -14, 100, 4, 0, 3, 50, -17, 0, -14, 102, 4 + db 0, 3, 47, -17, 0, -13, 104, 4, 0, 3, 45, -17, 0, -13, 106, 4 + db 0, 3, 42, -16, 0, -12, 108, 3, 0, 3, 40, -16, 0, -11, 109, 3 + db 0, 3, 37, -15, 0, -11, 111, 3, 0, 2, 35, -15, 0, -10, 113, 3 + db 0, 3, 32, -14, 0, -10, 114, 3, 0, 2, 29, -13, 0, -9, 116, 3 + db 0, 2, 27, -13, 0, -8, 117, 3, 0, 2, 25, -12, 0, -8, 119, 2 + db 0, 2, 22, -11, 0, -7, 120, 2, 0, 1, 20, -10, 0, -6, 121, 2 + db 0, 1, 18, -9, 0, -6, 122, 2, 0, 1, 15, -8, 0, -5, 123, 2 + db 0, 1, 13, -7, 0, -4, 124, 1, 0, 1, 11, -6, 0, -4, 125, 1 + db 0, 1, 8, -5, 0, -3, 126, 1, 0, 1, 6, -4, 0, -2, 126, 1 + db 0, 0, 4, -3, 0, -1, 127, 1, 0, 0, 2, -1, 0, 0, 127, 0 + db 0, 0, 2, -1, 0, 0, 127, 0 + +pw_258: times 2 dw 258 + +cextern mc_subpel_filters +%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) + +%macro BIDIR_JMP_TABLE 2-* + ;evaluated at definition time (in loop below) + %xdefine %1_%2_table (%%table - 2*%3) + %xdefine %%base %1_%2_table + %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) + ; dynamically generated label + %%table: + %rep %0 - 2 ; repeat for num args + dd %%prefix %+ .w%3 - %%base + %rotate 1 + %endrep +%endmacro + +BIDIR_JMP_TABLE avg, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_avg, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE mask, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE blend, ssse3, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_v, ssse3, 2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_h, ssse3, 2, 4, 8, 16, 16, 16, 16 + +%macro BASE_JMP_TABLE 3-* + %xdefine %1_%2_table (%%table - %3) + %xdefine %%base %1_%2 + %%table: + %rep %0 - 2 + dw %%base %+ _w%3 - %%base + %rotate 1 + %endrep +%endmacro + +%xdefine prep_sse2 mangle(private_prefix %+ _prep_bilin_8bpc_sse2.prep) +%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_8bpc_ssse3.put) +%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_8bpc_ssse3.prep) + +BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128 +BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128 + +%macro HV_JMP_TABLE 5-* + %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3) + %xdefine %%base %1_%3 + %assign %%types %4 + %if %%types & 1 + %xdefine %1_%2_h_%3_table (%%h - %5) + %%h: + %rep %0 - 4 + dw %%prefix %+ .h_w%5 - %%base + %rotate 1 + %endrep + %rotate 4 + %endif + 
%if %%types & 2 + %xdefine %1_%2_v_%3_table (%%v - %5) + %%v: + %rep %0 - 4 + dw %%prefix %+ .v_w%5 - %%base + %rotate 1 + %endrep + %rotate 4 + %endif + %if %%types & 4 + %xdefine %1_%2_hv_%3_table (%%hv - %5) + %%hv: + %rep %0 - 4 + dw %%prefix %+ .hv_w%5 - %%base + %rotate 1 + %endrep + %endif +%endmacro + +HV_JMP_TABLE prep, 8tap, sse2, 1, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, bilin, sse2, 7, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, 8tap, ssse3, 3, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, 8tap, ssse3, 1, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, bilin, ssse3, 7, 4, 8, 16, 32, 64, 128 + +%macro SCALED_JMP_TABLE 2-* + %xdefine %1_%2_table (%%table - %3) + %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2) +%%table: + %rep %0 - 2 + dw %%base %+ .w%3 - %%base + %rotate 1 + %endrep + %rotate 2 +%%dy_1024: + %xdefine %1_%2_dy1_table (%%dy_1024 - %3) + %rep %0 - 2 + dw %%base %+ .dy1_w%3 - %%base + %rotate 1 + %endrep + %rotate 2 +%%dy_2048: + %xdefine %1_%2_dy2_table (%%dy_2048 - %3) + %rep %0 - 2 + dw %%base %+ .dy2_w%3 - %%base + %rotate 1 + %endrep +%endmacro + +SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128 +SCALED_JMP_TABLE prep_8tap_scaled, ssse3, 4, 8, 16, 32, 64, 128 + +%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX + +SECTION .text + +INIT_XMM ssse3 + +%if ARCH_X86_32 + DECLARE_REG_TMP 1 + %define base t0-put_ssse3 +%else + DECLARE_REG_TMP 7 + %define base 0 +%endif + +%macro RESTORE_DSQ_32 1 + %if ARCH_X86_32 + mov %1, dsm ; restore dsq + %endif +%endmacro + +cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, h, mxy + movifnidn mxyd, r6m ; mx + LEA t0, put_ssse3 + movifnidn srcq, srcmp + movifnidn ssq, ssmp + tzcnt wd, wm + mov hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r7m ; my + test mxyd, mxyd + jnz .v +.put: + movzx wd, word [t0+wq*2+table_offset(put,)] + add wq, t0 + RESTORE_DSQ_32 t0 + jmp wq +.put_w2: + movzx r4d, word [srcq+ssq*0] + movzx r6d, word [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r4w + mov [dstq+dsq*1], r6w + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w2 + RET +.put_w4: + mov r4d, [srcq+ssq*0] + mov r6d, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mov [dstq+dsq*0], r4d + mov [dstq+dsq*1], r6d + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w4 + RET +.put_w8: + movq m0, [srcq+ssq*0] + movq m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movq [dstq+dsq*0], m0 + movq [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w8 + RET +.put_w16: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0], m0 + mova [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w16 + RET +.put_w32: + movu m0, [srcq+ssq*0+16*0] + movu m1, [srcq+ssq*0+16*1] + movu m2, [srcq+ssq*1+16*0] + movu m3, [srcq+ssq*1+16*1] + lea srcq, [srcq+ssq*2] + mova [dstq+dsq*0+16*0], m0 + mova [dstq+dsq*0+16*1], m1 + mova [dstq+dsq*1+16*0], m2 + mova [dstq+dsq*1+16*1], m3 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .put_w32 + RET +.put_w64: + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + add srcq, ssq + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + mova [dstq+16*3], m3 + add dstq, dsq + dec hd + jg .put_w64 + RET +.put_w128: + movu m0, [srcq+16*0] + movu m1, [srcq+16*1] + movu m2, [srcq+16*2] + movu m3, [srcq+16*3] + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + mova [dstq+16*2], m2 + mova [dstq+16*3], m3 + movu m0, [srcq+16*4] + movu m1, [srcq+16*5] 
+ movu m2, [srcq+16*6] + movu m3, [srcq+16*7] + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + mova [dstq+16*6], m2 + mova [dstq+16*7], m3 + add srcq, ssq + add dstq, dsq + dec hd + jg .put_w128 + RET +.h: + ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 + ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 + imul mxyd, 0x00ff00ff + mova m4, [base+bilin_h_shuf8] + mova m0, [base+bilin_h_shuf4] + add mxyd, 0x00100010 + movd m5, mxyd + mov mxyd, r7m ; my + pshufd m5, m5, q0000 + test mxyd, mxyd + jnz .hv + movzx wd, word [t0+wq*2+table_offset(put, _bilin_h)] + mova m3, [base+pw_2048] + add wq, t0 + movifnidn dsq, dsmp + jmp wq +.h_w2: + pshufd m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5} +.h_w2_loop: + movd m0, [srcq+ssq*0] + movd m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpckldq m0, m1 + pshufb m0, m4 + pmaddubsw m0, m5 + pmulhrsw m0, m3 + packuswb m0, m0 + movd r6d, m0 + mov [dstq+dsq*0], r6w + shr r6d, 16 + mov [dstq+dsq*1], r6w + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w2_loop + RET +.h_w4: + movq m4, [srcq+ssq*0] + movhps m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb m4, m0 + pmaddubsw m4, m5 + pmulhrsw m4, m3 + packuswb m4, m4 + movd [dstq+dsq*0], m4 + psrlq m4, 32 + movd [dstq+dsq*1], m4 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4 + RET +.h_w8: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w8 + RET +.h_w16: + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] + add srcq, ssq + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + mova [dstq], m0 + add dstq, dsq + dec hd + jg .h_w16 + RET +.h_w32: + movu m0, [srcq+mmsize*0+8*0] + movu m1, [srcq+mmsize*0+8*1] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + movu m1, [srcq+mmsize*1+8*0] + movu m2, [srcq+mmsize*1+8*1] + add srcq, ssq + pshufb m1, m4 + pshufb m2, m4 + pmaddubsw m1, m5 + pmaddubsw m2, m5 + pmulhrsw m1, m3 + pmulhrsw m2, m3 + packuswb m1, m2 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + add dstq, dsq + dec hd + jg .h_w32 + RET +.h_w64: + mov r6, -16*3 +.h_w64_loop: + movu m0, [srcq+r6+16*3+8*0] + movu m1, [srcq+r6+16*3+8*1] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + mova [dstq+r6+16*3], m0 + add r6, 16 + jle .h_w64_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w64 + RET +.h_w128: + mov r6, -16*7 +.h_w128_loop: + movu m0, [srcq+r6+16*7+8*0] + movu m1, [srcq+r6+16*7+8*1] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m3 + packuswb m0, m1 + mova [dstq+r6+16*7], m0 + add r6, 16 + jle .h_w128_loop + add srcq, ssq + add dstq, dsq + dec hd + jg .h_w128 + RET +.v: + movzx wd, word [t0+wq*2+table_offset(put, _bilin_v)] + imul mxyd, 0x00ff00ff + mova m5, [base+pw_2048] + add mxyd, 0x00100010 + add wq, t0 + movd m4, mxyd + pshufd m4, m4, q0000 + movifnidn dsq, dsmp + jmp wq +.v_w2: + movd m0, [srcq+ssq*0] +.v_w2_loop: + pinsrw m0, [srcq+ssq*1], 1 ; 0 1 + lea srcq, [srcq+ssq*2] + pshuflw m1, m0, q2301 + pinsrw m0, [srcq+ssq*0], 0 ; 2 1 + punpcklbw m1, m0 + pmaddubsw m1, m4 + pmulhrsw m1, m5 + packuswb m1, m1 + movd r6d, m1 + mov [dstq+dsq*1], r6w + shr r6d, 16 + mov [dstq+dsq*0], r6w + lea 
dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: + movd m0, [srcq+ssq*0] +.v_w4_loop: + movd m2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova m1, m0 + movd m0, [srcq+ssq*0] + punpckldq m1, m2 ; 0 1 + punpckldq m2, m0 ; 1 2 + punpcklbw m1, m2 + pmaddubsw m1, m4 + pmulhrsw m1, m5 + packuswb m1, m1 + movd [dstq+dsq*0], m1 + psrlq m1, 32 + movd [dstq+dsq*1], m1 + ; + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop + RET +.v_w8: + movq m0, [srcq+ssq*0] +.v_w8_loop: + movq m2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova m1, m0 + movq m0, [srcq+ssq*0] + punpcklbw m1, m2 + punpcklbw m2, m0 + pmaddubsw m1, m4 + pmaddubsw m2, m4 + pmulhrsw m1, m5 + pmulhrsw m2, m5 + packuswb m1, m2 + movq [dstq+dsq*0], m1 + movhps [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + RET +%macro PUT_BILIN_V_W16 0 + movu m0, [srcq+ssq*0] +%%loop: + movu m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova m1, m0 + mova m2, m0 + movu m0, [srcq+ssq*0] + punpcklbw m1, m3 + punpckhbw m2, m3 + pmaddubsw m1, m4 + pmaddubsw m2, m4 + pmulhrsw m1, m5 + pmulhrsw m2, m5 + packuswb m1, m2 + punpcklbw m2, m3, m0 + punpckhbw m3, m0 + pmaddubsw m2, m4 + pmaddubsw m3, m4 + pmulhrsw m2, m5 + pmulhrsw m3, m5 + packuswb m2, m3 + mova [dstq+dsq*0], m1 + mova [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg %%loop +%endmacro +.v_w16: + PUT_BILIN_V_W16 + RET +.v_w128: + lea r6d, [hq+(7<<16)] + jmp .v_w16gt +.v_w64: + lea r6d, [hq+(3<<16)] + jmp .v_w16gt +.v_w32: + lea r6d, [hq+(1<<16)] +.v_w16gt: + mov r4, srcq +%if ARCH_X86_64 + mov r7, dstq +%endif +.v_w16gt_loop: + PUT_BILIN_V_W16 +%if ARCH_X86_64 + add r4, 16 + add r7, 16 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 +%else + mov dstq, dstmp + add r4, 16 + movzx hd, r6w + add dstq, 16 + mov srcq, r4 + mov dstmp, dstq +%endif + sub r6d, 1<<16 + jg .v_w16gt + RET +.hv: + ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8 + ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4 + movzx wd, word [t0+wq*2+table_offset(put, _bilin_hv)] + WIN64_SPILL_XMM 8 + shl mxyd, 11 ; can't shift by 12 due to signed overflow + mova m7, [base+pw_15] + movd m6, mxyd + add wq, t0 + pshuflw m6, m6, q0000 + paddb m5, m5 + punpcklqdq m6, m6 + jmp wq +.hv_w2: + RESTORE_DSQ_32 t0 + movd m0, [srcq+ssq*0] + punpckldq m0, m0 + pshufb m0, m4 + pmaddubsw m0, m5 +.hv_w2_loop: + movd m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movd m2, [srcq+ssq*0] + punpckldq m1, m2 + pshufb m1, m4 + pmaddubsw m1, m5 ; 1 _ 2 _ + shufps m2, m0, m1, q1032 ; 0 _ 1 _ + mova m0, m1 + psubw m1, m2 ; 2 * (src[x + src_stride] - src[x]) + pmulhw m1, m6 ; (my * (src[x + src_stride] - src[x]) >> 4 + pavgw m2, m7 ; src[x] + 8 + paddw m1, m2 ; src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8 + psrlw m1, 4 + packuswb m1, m1 +%if ARCH_X86_64 + movq r6, m1 +%else + pshuflw m1, m1, q2020 + movd r6d, m1 +%endif + mov [dstq+dsq*0], r6w + shr r6, gprsize*4 + mov [dstq+dsq*1], r6w + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +.hv_w4: + mova m4, [base+bilin_h_shuf4] + movddup m0, [srcq+ssq*0] + movifnidn dsq, dsmp + pshufb m0, m4 + pmaddubsw m0, m5 +.hv_w4_loop: + movq m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movhps m1, [srcq+ssq*0] + pshufb m1, m4 + pmaddubsw m1, m5 ; 1 2 + shufps m2, m0, m1, q1032 ; 0 1 + mova m0, m1 + psubw m1, m2 + pmulhw m1, m6 + pavgw m2, m7 + paddw m1, m2 + psrlw m1, 4 + packuswb m1, m1 + movd [dstq+dsq*0], m1 + psrlq m1, 32 + movd [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w4_loop + RET +.hv_w8: + movu m0, 
[srcq+ssq*0] + movifnidn dsq, dsmp + pshufb m0, m4 + pmaddubsw m0, m5 +.hv_w8_loop: + movu m2, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb m2, m4 + pmaddubsw m2, m5 + psubw m1, m2, m0 + pmulhw m1, m6 + pavgw m0, m7 + paddw m1, m0 + movu m0, [srcq+ssq*0] + pshufb m0, m4 + pmaddubsw m0, m5 + psubw m3, m0, m2 + pmulhw m3, m6 + pavgw m2, m7 + paddw m3, m2 + psrlw m1, 4 + psrlw m3, 4 + packuswb m1, m3 + movq [dstq+dsq*0], m1 + movhps [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w128: + lea r6d, [hq+(7<<16)] + jmp .hv_w16_start +.hv_w64: + lea r6d, [hq+(3<<16)] + jmp .hv_w16_start +.hv_w32: + lea r6d, [hq+(1<<16)] +.hv_w16_start: + mov r4, srcq +%if ARCH_X86_32 + %define m8 [dstq] +%else + mov r7, dstq +%endif +.hv_w16: + movifnidn dsq, dsmp +%if WIN64 + movaps r4m, m8 +%endif +.hv_w16_loop0: + movu m0, [srcq+8*0] + movu m1, [srcq+8*1] + pshufb m0, m4 + pshufb m1, m4 + pmaddubsw m0, m5 + pmaddubsw m1, m5 +.hv_w16_loop: + add srcq, ssq + movu m2, [srcq+8*0] + movu m3, [srcq+8*1] + pshufb m2, m4 + pshufb m3, m4 + pmaddubsw m2, m5 + pmaddubsw m3, m5 + mova m8, m2 + psubw m2, m0 + pmulhw m2, m6 + pavgw m0, m7 + paddw m2, m0 + mova m0, m3 + psubw m3, m1 + pmulhw m3, m6 + pavgw m1, m7 + paddw m3, m1 + mova m1, m0 + mova m0, m8 + psrlw m2, 4 + psrlw m3, 4 + packuswb m2, m3 + mova [dstq], m2 + add dstq, dsmp + dec hd + jg .hv_w16_loop +%if ARCH_X86_32 + mov dstq, dstm + add r4, 16 + movzx hd, r6w + add dstq, 16 + mov srcq, r4 + mov dstm, dstq +%else + add r4, 16 + add r7, 16 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 +%endif + sub r6d, 1<<16 + jg .hv_w16_loop0 +%if WIN64 + movaps m8, r4m +%endif + RET + +%macro PSHUFB_BILIN_H8 2 ; dst, src + %if cpuflag(ssse3) + pshufb %1, %2 + %else + psrldq %2, %1, 1 + punpcklbw %1, %2 + %endif +%endmacro + +%macro PSHUFB_BILIN_H4 3 ; dst, src, tmp + %if cpuflag(ssse3) + pshufb %1, %2 + %else + psrldq %2, %1, 1 + punpckhbw %3, %1, %2 + punpcklbw %1, %2 + punpcklqdq %1, %3 + %endif +%endmacro + +%macro PMADDUBSW 5 ; dst/src1, src2, zero, tmp, reset_zero + %if cpuflag(ssse3) + pmaddubsw %1, %2 + %else + %if %5 == 1 + pxor %3, %3 + %endif + punpckhbw %4, %1, %3 + punpcklbw %1, %1, %3 + pmaddwd %4, %2 + pmaddwd %1, %2 + packssdw %1, %4 + %endif +%endmacro + +%macro PMULHRSW 5 ; dst, src, tmp, rndval, shift + %if cpuflag(ssse3) + pmulhrsw %1, %2 + %else + punpckhwd %3, %1, %4 + punpcklwd %1, %4 + pmaddwd %3, %2 + pmaddwd %1, %2 + psrad %3, %5 + psrad %1, %5 + packssdw %1, %3 + %endif +%endmacro + +%macro PREP_BILIN 0 +%if ARCH_X86_32 + %define base r6-prep%+SUFFIX +%else + %define base 0 +%endif + +cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 + movifnidn mxyd, r5m ; mx + LEA r6, prep%+SUFFIX + tzcnt wd, wm + movifnidn hd, hm + test mxyd, mxyd + jnz .h + mov mxyd, r6m ; my + test mxyd, mxyd + jnz .v +.prep: +%if notcpuflag(ssse3) + add r6, prep_ssse3 - prep_sse2 + jmp prep_ssse3 +%else + movzx wd, word [r6+wq*2+table_offset(prep,)] + pxor m4, m4 + add wq, r6 + lea stride3q, [strideq*3] + jmp wq +.prep_w4: + movd m0, [srcq+strideq*0] + movd m1, [srcq+strideq*1] + movd m2, [srcq+strideq*2] + movd m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + punpckldq m0, m1 + punpckldq m2, m3 + punpcklbw m0, m4 + punpcklbw m2, m4 + psllw m0, 4 + psllw m2, 4 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m2 + add tmpq, 16*2 + sub hd, 4 + jg .prep_w4 + RET +.prep_w8: + movq m0, [srcq+strideq*0] + movq m1, [srcq+strideq*1] + movq m2, [srcq+strideq*2] + movq m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + punpcklbw m0, m4 
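+; The PMULHRSW fallback above interleaves each word with the rounding
+; constant so that a single pmaddwd yields x*mul + rnd per lane, and
+; psrad then applies the shift. With SSSE3, pmulhrsw itself computes
+; (x*c*2 + 0x8000) >> 16. A C-like reference sketch (illustrative,
+; not part of this file) of the constants these kernels rely on:
+;   static inline int mulhrs(int x, int c) {
+;       return (x * c * 2 + 0x8000) >> 16; // == ((x*c >> 14) + 1) >> 1
+;   }
+;   // c = 2048 (pw_2048): mulhrs(x, c) == (x +  8) >> 4  (put_bilin)
+;   // c =  512 (pw_512):  mulhrs(x, c) == (x + 32) >> 6  (put_8tap)
+;   // c = 8192 (pw_8192): mulhrs(x, c) == (x +  2) >> 2  (prep)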
+ punpcklbw m1, m4 + punpcklbw m2, m4 + punpcklbw m3, m4 + psllw m0, 4 + psllw m1, 4 + psllw m2, 4 + psllw m3, 4 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + sub hd, 4 + jg .prep_w8 + RET +.prep_w16: + movu m1, [srcq+strideq*0] + movu m3, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + punpcklbw m0, m1, m4 + punpckhbw m1, m4 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + psllw m0, 4 + psllw m1, 4 + psllw m2, 4 + psllw m3, 4 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + sub hd, 2 + jg .prep_w16 + RET +.prep_w128: + mov r3, -128 + jmp .prep_w32_start +.prep_w64: + mov r3, -64 + jmp .prep_w32_start +.prep_w32: + mov r3, -32 +.prep_w32_start: + sub srcq, r3 +.prep_w32_vloop: + mov r6, r3 +.prep_w32_hloop: + movu m1, [srcq+r6+16*0] + movu m3, [srcq+r6+16*1] + punpcklbw m0, m1, m4 + punpckhbw m1, m4 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + psllw m0, 4 + psllw m1, 4 + psllw m2, 4 + psllw m3, 4 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + add r6, 32 + jl .prep_w32_hloop + add srcq, strideq + dec hd + jg .prep_w32_vloop + RET +%endif +.h: + ; 16 * src[x] + (mx * (src[x + 1] - src[x])) + ; = (16 - mx) * src[x] + mx * src[x + 1] +%if cpuflag(ssse3) + imul mxyd, 0x00ff00ff + mova m4, [base+bilin_h_shuf8] + add mxyd, 0x00100010 +%else + imul mxyd, 0xffff + add mxyd, 16 +%endif + movd m5, mxyd + mov mxyd, r6m ; my + pshufd m5, m5, q0000 + test mxyd, mxyd + jnz .hv + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] +%if notcpuflag(ssse3) + WIN64_SPILL_XMM 8 + pxor m6, m6 +%endif + add wq, r6 + jmp wq +.h_w4: +%if cpuflag(ssse3) + mova m4, [base+bilin_h_shuf4] +%endif + lea stride3q, [strideq*3] +.h_w4_loop: + movq m0, [srcq+strideq*0] + movhps m0, [srcq+strideq*1] + movq m1, [srcq+strideq*2] + movhps m1, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + PSHUFB_BILIN_H4 m0, m4, m2 + PMADDUBSW m0, m5, m6, m2, 0 + PSHUFB_BILIN_H4 m1, m4, m2 + PMADDUBSW m1, m5, m6, m2, 0 + mova [tmpq+0 ], m0 + mova [tmpq+16], m1 + add tmpq, 32 + sub hd, 4 + jg .h_w4_loop + RET +.h_w8: + lea stride3q, [strideq*3] +.h_w8_loop: + movu m0, [srcq+strideq*0] + movu m1, [srcq+strideq*1] + movu m2, [srcq+strideq*2] + movu m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + PSHUFB_BILIN_H8 m0, m4 + PSHUFB_BILIN_H8 m1, m4 + PSHUFB_BILIN_H8 m2, m4 + PSHUFB_BILIN_H8 m3, m4 + PMADDUBSW m0, m5, m6, m7, 0 + PMADDUBSW m1, m5, m6, m7, 0 + PMADDUBSW m2, m5, m6, m7, 0 + PMADDUBSW m3, m5, m6, m7, 0 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + sub hd, 4 + jg .h_w8_loop + RET +.h_w16: + movu m0, [srcq+strideq*0+8*0] + movu m1, [srcq+strideq*0+8*1] + movu m2, [srcq+strideq*1+8*0] + movu m3, [srcq+strideq*1+8*1] + lea srcq, [srcq+strideq*2] + PSHUFB_BILIN_H8 m0, m4 + PSHUFB_BILIN_H8 m1, m4 + PSHUFB_BILIN_H8 m2, m4 + PSHUFB_BILIN_H8 m3, m4 + PMADDUBSW m0, m5, m6, m7, 0 + PMADDUBSW m1, m5, m6, m7, 0 + PMADDUBSW m2, m5, m6, m7, 0 + PMADDUBSW m3, m5, m6, m7, 0 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + sub hd, 2 + jg .h_w16 + RET +.h_w128: + mov r3, -128 + jmp .h_w32_start +.h_w64: + mov r3, -64 + jmp .h_w32_start +.h_w32: + mov r3, -32 +.h_w32_start: + sub srcq, r3 +.h_w32_vloop: + mov r6, r3 +.h_w32_hloop: + movu m0, [srcq+r6+8*0] + movu m1, [srcq+r6+8*1] + movu m2, [srcq+r6+8*2] + movu m3, [srcq+r6+8*3] + PSHUFB_BILIN_H8 
m0, m4 + PSHUFB_BILIN_H8 m1, m4 + PSHUFB_BILIN_H8 m2, m4 + PSHUFB_BILIN_H8 m3, m4 + PMADDUBSW m0, m5, m6, m7, 0 + PMADDUBSW m1, m5, m6, m7, 0 + PMADDUBSW m2, m5, m6, m7, 0 + PMADDUBSW m3, m5, m6, m7, 0 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + add r6, 32 + jl .h_w32_hloop + add srcq, strideq + dec hd + jg .h_w32_vloop + RET +.v: +%if notcpuflag(ssse3) + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 8 +%endif + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] +%if cpuflag(ssse3) + imul mxyd, 0x00ff00ff + add mxyd, 0x00100010 +%else + imul mxyd, 0xffff + pxor m6, m6 + add mxyd, 16 +%endif + add wq, r6 + lea stride3q, [strideq*3] + movd m5, mxyd + pshufd m5, m5, q0000 + jmp wq +.v_w4: + movd m0, [srcq+strideq*0] +.v_w4_loop: + movd m1, [srcq+strideq*1] + movd m2, [srcq+strideq*2] + movd m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + punpckldq m0, m1 + punpckldq m1, m2 + punpcklbw m0, m1 ; 01 12 + PMADDUBSW m0, m5, m6, m7, 0 + mova [tmpq+16*0], m0 + movd m0, [srcq+strideq*0] + punpckldq m2, m3 + punpckldq m3, m0 + punpcklbw m2, m3 ; 23 34 + PMADDUBSW m2, m5, m6, m7, 0 + mova [tmpq+16*1], m2 + add tmpq, 16*2 + sub hd, 4 + jg .v_w4_loop + RET +.v_w8: + movq m0, [srcq+strideq*0] +.v_w8_loop: + movq m1, [srcq+strideq*1] + movq m2, [srcq+strideq*2] + movq m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + punpcklbw m0, m1 ; 01 + punpcklbw m1, m2 ; 12 + PMADDUBSW m0, m5, m6, m7, 0 + PMADDUBSW m1, m5, m6, m7, 0 + mova [tmpq+16*0], m0 + movq m0, [srcq+strideq*0] + punpcklbw m2, m3 ; 23 + punpcklbw m3, m0 ; 34 + PMADDUBSW m2, m5, m6, m7, 0 + mova [tmpq+16*1], m1 + PMADDUBSW m3, m5, m6, m7, 0 + mova [tmpq+16*2], m2 + mova [tmpq+16*3], m3 + add tmpq, 16*4 + sub hd, 4 + jg .v_w8_loop + RET +.v_w16: + movu m0, [srcq+strideq*0] +.v_w16_loop: + movu m1, [srcq+strideq*1] + movu m2, [srcq+strideq*2] + movu m3, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + punpcklbw m4, m0, m1 + punpckhbw m0, m1 + PMADDUBSW m4, m5, m6, m7, 0 + PMADDUBSW m0, m5, m6, m7, 0 + mova [tmpq+16*0], m4 + punpcklbw m4, m1, m2 + punpckhbw m1, m2 + PMADDUBSW m4, m5, m6, m7, 0 + mova [tmpq+16*1], m0 + movu m0, [srcq+strideq*0] + PMADDUBSW m1, m5, m6, m7, 0 + mova [tmpq+16*2], m4 + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + PMADDUBSW m4, m5, m6, m7, 0 + mova [tmpq+16*3], m1 + PMADDUBSW m2, m5, m6, m7, 0 + mova [tmpq+16*4], m4 + punpcklbw m4, m3, m0 + punpckhbw m3, m0 + PMADDUBSW m4, m5, m6, m7, 0 + mova [tmpq+16*5], m2 + PMADDUBSW m3, m5, m6, m7, 0 + mova [tmpq+16*6], m4 + mova [tmpq+16*7], m3 + add tmpq, 16*8 + sub hd, 4 + jg .v_w16_loop + RET +.v_w128: + lea r3d, [hq+(3<<8)] + mov r6d, 256 + jmp .v_w32_start +.v_w64: + lea r3d, [hq+(1<<8)] + mov r6d, 128 + jmp .v_w32_start +.v_w32: + xor r3d, r3d + mov r6d, 64 +.v_w32_start: +%if ARCH_X86_64 + %if WIN64 + PUSH r7 + %endif + mov r7, tmpq +%endif + mov r5, srcq +.v_w32_hloop: + movu m0, [srcq+strideq*0+16*0] + movu m1, [srcq+strideq*0+16*1] +.v_w32_vloop: + movu m2, [srcq+strideq*1+16*0] + movu m3, [srcq+strideq*1+16*1] + lea srcq, [srcq+strideq*2] + punpcklbw m4, m0, m2 + punpckhbw m0, m2 + PMADDUBSW m4, m5, m6, m7, 0 + PMADDUBSW m0, m5, m6, m7, 0 + mova [tmpq+16*0], m4 + mova [tmpq+16*1], m0 + movu m0, [srcq+strideq*0+16*0] + punpcklbw m4, m1, m3 + punpckhbw m1, m3 + PMADDUBSW m4, m5, m6, m7, 0 + PMADDUBSW m1, m5, m6, m7, 0 + mova [tmpq+16*2], m4 + mova [tmpq+16*3], m1 + movu m1, [srcq+strideq*0+16*1] + add tmpq, r6 + punpcklbw m4, m2, m0 + punpckhbw m2, m0 + PMADDUBSW m4, m5, m6, m7, 
0 + PMADDUBSW m2, m5, m6, m7, 0 + mova [tmpq+16*0], m4 + mova [tmpq+16*1], m2 + punpcklbw m4, m3, m1 + punpckhbw m3, m1 + PMADDUBSW m4, m5, m6, m7, 0 + PMADDUBSW m3, m5, m6, m7, 0 + mova [tmpq+16*2], m4 + mova [tmpq+16*3], m3 + add tmpq, r6 + sub hd, 2 + jg .v_w32_vloop + add r5, 32 + movzx hd, r3b + mov srcq, r5 +%if ARCH_X86_64 + add r7, 16*4 + mov tmpq, r7 +%else + mov tmpq, tmpmp + add tmpq, 16*4 + mov tmpmp, tmpq +%endif + sub r3d, 1<<8 + jg .v_w32_hloop +%if WIN64 + POP r7 +%endif + RET +.hv: + ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 + ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) + movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] +%assign stack_offset stack_offset - stack_size_padded +%if cpuflag(ssse3) + imul mxyd, 0x08000800 + WIN64_SPILL_XMM 8 +%else + or mxyd, 1<<16 + WIN64_SPILL_XMM 9 + %if ARCH_X86_64 + mova m8, [base+pw_8] + %else + %define m8 [base+pw_8] + %endif + pxor m7, m7 +%endif + movd m6, mxyd + add wq, r6 + pshufd m6, m6, q0000 + jmp wq +.hv_w4: +%if cpuflag(ssse3) + mova m4, [base+bilin_h_shuf4] + movddup m0, [srcq+strideq*0] +%else + movhps m0, [srcq+strideq*0] +%endif + lea r3, [strideq*3] + PSHUFB_BILIN_H4 m0, m4, m3 + PMADDUBSW m0, m5, m7, m4, 0 ; _ 0 +.hv_w4_loop: + movq m1, [srcq+strideq*1] + movhps m1, [srcq+strideq*2] + movq m2, [srcq+r3 ] + lea srcq, [srcq+strideq*4] + movhps m2, [srcq+strideq*0] + PSHUFB_BILIN_H4 m1, m4, m3 + PSHUFB_BILIN_H4 m2, m4, m3 + PMADDUBSW m1, m5, m7, m4, 0 ; 1 2 + PMADDUBSW m2, m5, m7, m4, 0 ; 3 4 + shufpd m0, m1, 0x01 ; 0 1 + shufpd m3, m1, m2, 0x01 ; 2 3 + psubw m1, m0 + PMULHRSW m1, m6, m4, m8, 4 + paddw m1, m0 + mova m0, m2 + psubw m2, m3 + PMULHRSW m2, m6, m4, m8, 4 + paddw m2, m3 + mova [tmpq+16*0], m1 + mova [tmpq+16*1], m2 + add tmpq, 32 + sub hd, 4 + jg .hv_w4_loop + RET +.hv_w8: + movu m0, [srcq+strideq*0] + PSHUFB_BILIN_H8 m0, m4 + PMADDUBSW m0, m5, m7, m4, 0 ; 0 +.hv_w8_loop: + movu m1, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + movu m2, [srcq+strideq*0] + PSHUFB_BILIN_H8 m1, m4 + PSHUFB_BILIN_H8 m2, m4 + PMADDUBSW m1, m5, m7, m4, 0 ; 1 + PMADDUBSW m2, m5, m7, m4, 0 ; 2 + psubw m3, m1, m0 + PMULHRSW m3, m6, m4, m8, 4 + paddw m3, m0 + mova m0, m2 + psubw m2, m1 + PMULHRSW m2, m6, m4, m8, 4 + paddw m2, m1 + mova [tmpq+16*0], m3 + mova [tmpq+16*1], m2 + add tmpq, 16*2 + sub hd, 2 + jg .hv_w8_loop + RET +.hv_w128: + lea r3d, [hq+(7<<8)] + mov r5d, 256 + jmp .hv_w16_start +.hv_w64: + lea r3d, [hq+(3<<8)] + mov r5d, 128 + jmp .hv_w16_start +.hv_w32: + lea r3d, [hq+(1<<8)] + mov r5d, 64 + jmp .hv_w16_start +.hv_w16: + xor r3d, r3d + mov r5d, 32 +.hv_w16_start: +%if ARCH_X86_64 || cpuflag(ssse3) + mov r6, srcq +%endif +%if ARCH_X86_64 + %if WIN64 + PUSH r7 + %endif + mov r7, tmpq +%endif +.hv_w16_hloop: + movu m0, [srcq+strideq*0+8*0] + movu m1, [srcq+strideq*0+8*1] + PSHUFB_BILIN_H8 m0, m4 + PSHUFB_BILIN_H8 m1, m4 + PMADDUBSW m0, m5, m7, m4, 0 ; 0a + PMADDUBSW m1, m5, m7, m4, 0 ; 0b +.hv_w16_vloop: + movu m2, [srcq+strideq*1+8*0] + PSHUFB_BILIN_H8 m2, m4 + PMADDUBSW m2, m5, m7, m4, 0 ; 1a + psubw m3, m2, m0 + PMULHRSW m3, m6, m4, m8, 4 + paddw m3, m0 + mova [tmpq+16*0], m3 + movu m3, [srcq+strideq*1+8*1] + lea srcq, [srcq+strideq*2] + PSHUFB_BILIN_H8 m3, m4 + PMADDUBSW m3, m5, m7, m4, 0 ; 1b + psubw m0, m3, m1 + PMULHRSW m0, m6, m4, m8, 4 + paddw m0, m1 + mova [tmpq+16*1], m0 + add tmpq, r5 + movu m0, [srcq+strideq*0+8*0] + PSHUFB_BILIN_H8 m0, m4 + PMADDUBSW m0, m5, m7, m4, 0 ; 2a + psubw m1, m0, m2 + PMULHRSW m1, m6, m4, m8, 4 + paddw m1, m2 + mova [tmpq+16*0], m1 + movu 
m1, [srcq+strideq*0+8*1] + PSHUFB_BILIN_H8 m1, m4 + PMADDUBSW m1, m5, m7, m4, 0 ; 2b + psubw m2, m1, m3 + PMULHRSW m2, m6, m4, m8, 4 + paddw m2, m3 + mova [tmpq+16*1], m2 + add tmpq, r5 + sub hd, 2 + jg .hv_w16_vloop + movzx hd, r3b +%if ARCH_X86_64 + add r6, 16 + add r7, 2*16 + mov srcq, r6 + mov tmpq, r7 +%elif cpuflag(ssse3) + mov tmpq, tmpm + add r6, 16 + add tmpq, 2*16 + mov srcq, r6 + mov tmpm, tmpq +%else + mov srcq, srcm + mov tmpq, tmpm + add srcq, 16 + add tmpq, 2*16 + mov srcm, srcq + mov tmpm, tmpq +%endif + sub r3d, 1<<8 + jg .hv_w16_hloop +%if WIN64 + POP r7 +%endif + RET +%endmacro + +; int8_t subpel_filters[5][15][8] +%assign FILTER_REGULAR (0*15 << 16) | 3*15 +%assign FILTER_SMOOTH (1*15 << 16) | 4*15 +%assign FILTER_SHARP (2*15 << 16) | 3*15 + +%macro FN 4 ; prefix, type, type_h, type_v +cglobal %1_%2_8bpc + mov t0d, FILTER_%3 +%ifidn %3, %4 + mov t1d, t0d +%else + mov t1d, FILTER_%4 +%endif +%ifnidn %2, regular ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX) +%endif +%endmacro + +%if ARCH_X86_32 +DECLARE_REG_TMP 1, 2 +%elif WIN64 +DECLARE_REG_TMP 4, 5 +%else +DECLARE_REG_TMP 7, 8 +%endif + +FN put_8tap, sharp, SHARP, SHARP +FN put_8tap, sharp_smooth, SHARP, SMOOTH +FN put_8tap, smooth_sharp, SMOOTH, SHARP +FN put_8tap, smooth, SMOOTH, SMOOTH +FN put_8tap, sharp_regular, SHARP, REGULAR +FN put_8tap, regular_sharp, REGULAR, SHARP +FN put_8tap, smooth_regular, SMOOTH, REGULAR +FN put_8tap, regular_smooth, REGULAR, SMOOTH +FN put_8tap, regular, REGULAR, REGULAR + +%if ARCH_X86_32 + %define base_reg r1 + %define base base_reg-put_ssse3 +%else + %define base_reg r8 + %define base 0 +%endif + +cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 +%assign org_stack_offset stack_offset + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h +%if ARCH_X86_64 + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v +%else + imul ssd, mym, 0x010101 + add ssd, t1d ; 8tap_v, my, 4tap_v + mov srcq, srcm +%endif + mov wd, wm + movifnidn hd, hm + LEA base_reg, put_ssse3 + test mxd, 0xf00 + jnz .h +%if ARCH_X86_32 + test ssd, 0xf00 +%else + test myd, 0xf00 +%endif + jnz .v + tzcnt wd, wd + movzx wd, word [base_reg+wq*2+table_offset(put,)] + add wq, base_reg +; put_bilin mangling jump +%assign stack_offset org_stack_offset + movifnidn dsq, dsmp + movifnidn ssq, ssmp +%if WIN64 + pop r8 +%endif + lea r6, [ssq*3] + jmp wq +.h: +%if ARCH_X86_32 + test ssd, 0xf00 +%else + test myd, 0xf00 +%endif + jnz .hv + movifnidn ssq, ssmp + WIN64_SPILL_XMM 12 + cmp wd, 4 + jl .h_w2 + je .h_w4 + tzcnt wd, wd +%if ARCH_X86_64 + mova m10, [base+subpel_h_shufA] + mova m11, [base+subpel_h_shufB] + mova m9, [base+subpel_h_shufC] +%endif + shr mxd, 16 + sub srcq, 3 + movzx wd, word [base_reg+wq*2+table_offset(put, _8tap_h)] + movq m6, [base_reg+mxq*8+subpel_filters-put_ssse3] + mova m7, [base+pw_34] ; 2 + (8 << 2) + pshufd m5, m6, q0000 + pshufd m6, m6, q1111 + add wq, base_reg + jmp wq +.h_w2: +%if ARCH_X86_32 + and mxd, 0x7f +%else + movzx mxd, mxb +%endif + dec srcq + mova m4, [base+subpel_h_shuf4] + movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2] + mova m5, [base+pw_34] ; 2 + (8 << 2) + pshufd m3, m3, q0000 + movifnidn dsq, dsmp +.h_w2_loop: + movq m0, [srcq+ssq*0] + movhps m0, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb m0, m4 + pmaddubsw m0, m3 + phaddw m0, m0 + paddw m0, m5 ; pw34 + psraw m0, 6 + packuswb m0, m0 + movd r6d, m0 + mov [dstq+dsq*0], r6w + shr r6d, 16 + mov [dstq+dsq*1], r6w + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg 
.h_w2_loop + RET +.h_w4: +%if ARCH_X86_32 + and mxd, 0x7f +%else + movzx mxd, mxb +%endif + dec srcq + movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2] + mova m6, [base+subpel_h_shufA] + mova m5, [base+pw_34] ; 2 + (8 << 2) + pshufd m3, m3, q0000 + movifnidn dsq, dsmp +.h_w4_loop: + movq m0, [srcq+ssq*0] ; 1 + movq m1, [srcq+ssq*1] ; 2 + lea srcq, [srcq+ssq*2] + pshufb m0, m6 ; subpel_h_shufA + pshufb m1, m6 ; subpel_h_shufA + pmaddubsw m0, m3 ; subpel_filters + pmaddubsw m1, m3 ; subpel_filters + phaddw m0, m1 + paddw m0, m5 ; pw34 + psraw m0, 6 + packuswb m0, m0 + movd [dstq+dsq*0], m0 + psrlq m0, 32 + movd [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .h_w4_loop + RET +%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3] + %if ARCH_X86_32 + pshufb %2, %1, [base+subpel_h_shufB] + pshufb %3, %1, [base+subpel_h_shufC] + pshufb %1, [base+subpel_h_shufA] + %else + pshufb %2, %1, m11; subpel_h_shufB + pshufb %3, %1, m9 ; subpel_h_shufC + pshufb %1, m10 ; subpel_h_shufA + %endif + pmaddubsw %4, %2, m5 ; subpel +0 B0 + pmaddubsw %2, m6 ; subpel +4 B4 + pmaddubsw %3, m6 ; C4 + pmaddubsw %1, m5 ; A0 + paddw %3, %4 ; C4+B0 + paddw %1, %2 ; A0+B4 + phaddw %1, %3 + paddw %1, m7 ; pw34 + psraw %1, 6 +%endmacro +.h_w8: + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + PUT_8TAP_H m0, m2, m3, m4 + PUT_8TAP_H m1, m2, m3, m4 + packuswb m0, m1 +%if ARCH_X86_32 + movq [dstq], m0 + add dstq, dsm + movhps [dstq], m0 + add dstq, dsm +%else + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] +%endif + sub hd, 2 + jg .h_w8 + RET +.h_w128: + mov r4, -16*7 + jmp .h_w16_start +.h_w64: + mov r4, -16*3 + jmp .h_w16_start +.h_w32: + mov r4, -16*1 + jmp .h_w16_start +.h_w16: + xor r4d, r4d +.h_w16_start: + sub srcq, r4 + sub dstq, r4 +.h_w16_loop_v: + mov r6, r4 +.h_w16_loop_h: + movu m0, [srcq+r6+8*0] + movu m1, [srcq+r6+8*1] + PUT_8TAP_H m0, m2, m3, m4 + PUT_8TAP_H m1, m2, m3, m4 + packuswb m0, m1 + mova [dstq+r6], m0 + add r6, 16 + jle .h_w16_loop_h + add srcq, ssq + add dstq, dsmp + dec hd + jg .h_w16_loop_v + RET +.v: +%if ARCH_X86_32 + movzx mxd, ssb + shr ssd, 16 + cmp hd, 6 + cmovs ssd, mxd + movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3] +%else + %assign stack_offset org_stack_offset + WIN64_SPILL_XMM 16 + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + movq m0, [base_reg+myq*8+subpel_filters-put_ssse3] +%endif + tzcnt r6d, wd + movzx r6d, word [base_reg+r6*2+table_offset(put, _8tap_v)] + punpcklwd m0, m0 + mova m7, [base+pw_512] + add r6, base_reg +%if ARCH_X86_32 + %define subpel0 [rsp+mmsize*0] + %define subpel1 [rsp+mmsize*1] + %define subpel2 [rsp+mmsize*2] + %define subpel3 [rsp+mmsize*3] +%assign regs_used 2 ; use r1 (ds) as tmp for stack alignment if needed + ALLOC_STACK -16*4 +%assign regs_used 7 + pshufd m1, m0, q0000 + mova subpel0, m1 + pshufd m1, m0, q1111 + mova subpel1, m1 + pshufd m1, m0, q2222 + mova subpel2, m1 + pshufd m1, m0, q3333 + mova subpel3, m1 + mov ssq, [rstk+stack_offset+gprsize*4] + lea ssq, [ssq*3] + sub srcq, ssq + mov ssq, [rstk+stack_offset+gprsize*4] + mov dsq, [rstk+stack_offset+gprsize*2] +%else + %define subpel0 m8 + %define subpel1 m9 + %define subpel2 m10 + %define subpel3 m11 + lea ss3q, [ssq*3] + pshufd m8, m0, q0000 + sub srcq, ss3q + pshufd m9, m0, q1111 + pshufd m10, m0, q2222 + pshufd m11, m0, q3333 +%endif + jmp r6 +.v_w2: + movd m1, [srcq+ssq*0] + movd m0, [srcq+ssq*1] +%if ARCH_X86_32 + lea srcq, [srcq+ssq*2] + movd m2, [srcq+ssq*0] + movd m5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movd m3, 
[srcq+ssq*0] + movd m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] +%else + movd m2, [srcq+ssq*2] + add srcq, ss3q + movd m5, [srcq+ssq*0] + movd m3, [srcq+ssq*1] + movd m4, [srcq+ssq*2] + add srcq, ss3q +%endif + punpcklwd m1, m0 ; 0 1 + punpcklwd m0, m2 ; 1 2 + punpcklbw m1, m0 ; 01 12 + movd m0, [srcq+ssq*0] + punpcklwd m2, m5 ; 2 3 + punpcklwd m5, m3 ; 3 4 + punpcklwd m3, m4 ; 4 5 + punpcklwd m4, m0 ; 5 6 + punpcklbw m2, m5 ; 23 34 + punpcklbw m3, m4 ; 45 56 +.v_w2_loop: + movd m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw m5, m1, subpel0 ; a0 b0 + mova m1, m2 + pmaddubsw m2, subpel1 ; a1 b1 + paddw m5, m2 + mova m2, m3 + pmaddubsw m3, subpel2 ; a2 b2 + paddw m5, m3 + punpcklwd m3, m0, m4 ; 6 7 + movd m0, [srcq+ssq*0] + punpcklwd m4, m0 ; 7 8 + punpcklbw m3, m4 ; 67 78 + pmaddubsw m4, m3, subpel3 ; a3 b3 + paddw m5, m4 + pmulhrsw m5, m7 + packuswb m5, m5 + movd r6d, m5 + mov [dstq+dsq*0], r6w + shr r6d, 16 + mov [dstq+dsq*1], r6w + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w2_loop + RET +.v_w4: +%if ARCH_X86_32 +.v_w8: +.v_w16: +.v_w32: +.v_w64: +.v_w128: + shl wd, 14 +%if STACK_ALIGNMENT < 16 + %define dstm [rsp+mmsize*4+gprsize] + mov dstm, dstq +%endif + lea r6d, [hq+wq-(1<<16)] + mov r4, srcq +.v_w4_loop0: +%endif + movd m1, [srcq+ssq*0] + movd m0, [srcq+ssq*1] +%if ARCH_X86_32 + lea srcq, [srcq+ssq*2] + movd m2, [srcq+ssq*0] + movd m5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + movd m3, [srcq+ssq*0] + movd m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] +%else + movd m2, [srcq+ssq*2] + add srcq, ss3q + movd m5, [srcq+ssq*0] + movd m3, [srcq+ssq*1] + movd m4, [srcq+ssq*2] + add srcq, ss3q +%endif + punpckldq m1, m0 ; 0 1 + punpckldq m0, m2 ; 1 2 + punpcklbw m1, m0 ; 01 12 + movd m0, [srcq+ssq*0] + punpckldq m2, m5 ; 2 3 + punpckldq m5, m3 ; 3 4 + punpckldq m3, m4 ; 4 5 + punpckldq m4, m0 ; 5 6 + punpcklbw m2, m5 ; 23 34 + punpcklbw m3, m4 ; 45 56 +.v_w4_loop: + movd m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw m5, m1, subpel0 ; a0 b0 + mova m1, m2 + pmaddubsw m2, subpel1 ; a1 b1 + paddw m5, m2 + mova m2, m3 + pmaddubsw m3, subpel2 ; a2 b2 + paddw m5, m3 + punpckldq m3, m0, m4 ; 6 7 _ _ + movd m0, [srcq+ssq*0] + punpckldq m4, m0 ; 7 8 _ _ + punpcklbw m3, m4 ; 67 78 + pmaddubsw m4, m3, subpel3 ; a3 b3 + paddw m5, m4 + pmulhrsw m5, m7 + packuswb m5, m5 + movd [dstq+dsq*0], m5 + psrlq m5, 32 + movd [dstq+dsq*1], m5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w4_loop +%if ARCH_X86_32 + mov dstq, dstm + add r4, 4 + movzx hd, r6w + add dstq, 4 + mov srcq, r4 + mov dstm, dstq + sub r6d, 1<<16 + jg .v_w4_loop0 +%endif + RET +%if ARCH_X86_64 +.v_w8: +.v_w16: +.v_w32: +.v_w64: +.v_w128: + lea r6d, [wq*8-64] + mov r4, srcq + mov r7, dstq + lea r6d, [hq+r6*4] +.v_w8_loop0: + movq m1, [srcq+ssq*0] + movq m2, [srcq+ssq*1] + movq m3, [srcq+ssq*2] + add srcq, ss3q + movq m4, [srcq+ssq*0] + movq m5, [srcq+ssq*1] + movq m6, [srcq+ssq*2] + add srcq, ss3q + movq m0, [srcq+ssq*0] + punpcklbw m1, m2 ; 01 + punpcklbw m2, m3 ; 12 + punpcklbw m3, m4 ; 23 + punpcklbw m4, m5 ; 34 + punpcklbw m5, m6 ; 45 + punpcklbw m6, m0 ; 56 +.v_w8_loop: + movq m13, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddubsw m14, m1, subpel0 ; a0 + mova m1, m3 + pmaddubsw m15, m2, subpel0 ; b0 + mova m2, m4 + pmaddubsw m3, subpel1 ; a1 + mova m12, m0 + pmaddubsw m4, subpel1 ; b1 + movq m0, [srcq+ssq*0] + paddw m14, m3 + paddw m15, m4 + mova m3, m5 + pmaddubsw m5, subpel2 ; a2 + mova m4, m6 + pmaddubsw m6, subpel2 ; b2 + punpcklbw m12, m13 ; 67 + punpcklbw m13, m0 ; 78 + paddw m14, m5 + mova m5, m12 + pmaddubsw m12, subpel3 ; a3 
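+; C-like reference for the vertical 8-tap computed here (a sketch with
+; illustrative names; the 8bpc table appears to store the AV1 filter
+; coefficients halved, so the eight taps sum to 64):
+;   int sum = 0;
+;   for (int k = 0; k < 8; k++)
+;       sum += f[k] * src[x + (k - 3) * ss];
+;   dst[x] = clip_pixel((sum + 32) >> 6); // pmulhrsw with m7 = pw_512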
+ paddw m15, m6 + mova m6, m13 + pmaddubsw m13, subpel3 ; b3 + paddw m14, m12 + paddw m15, m13 + pmulhrsw m14, m7 + pmulhrsw m15, m7 + packuswb m14, m15 + movq [dstq+dsq*0], m14 + movhps [dstq+dsq*1], m14 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .v_w8_loop + add r4, 8 + add r7, 8 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 + sub r6d, 1<<8 + jg .v_w8_loop0 + RET +%endif ;ARCH_X86_64 +%undef subpel0 +%undef subpel1 +%undef subpel2 +%undef subpel3 +.hv: + %assign stack_offset org_stack_offset + cmp wd, 4 + jg .hv_w8 +%if ARCH_X86_32 + and mxd, 0x7f +%else + movzx mxd, mxb +%endif + dec srcq + movd m1, [base_reg+mxq*8+subpel_filters-put_ssse3+2] +%if ARCH_X86_32 + movzx mxd, ssb + shr ssd, 16 + cmp hd, 6 + cmovs ssd, mxd + movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3] + mov ssq, ssmp + lea r6, [ssq*3] + sub srcq, r6 + %define base_reg r6 + mov r6, r1; use as new base + %assign regs_used 2 + ALLOC_STACK -mmsize*14 + %assign regs_used 7 + mov dsq, [rstk+stack_offset+gprsize*2] + %define subpelv0 [rsp+mmsize*0] + %define subpelv1 [rsp+mmsize*1] + %define subpelv2 [rsp+mmsize*2] + %define subpelv3 [rsp+mmsize*3] + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + pshufd m6, m0, q0000 + mova subpelv0, m6 + pshufd m6, m0, q1111 + mova subpelv1, m6 + pshufd m6, m0, q2222 + mova subpelv2, m6 + pshufd m6, m0, q3333 + mova subpelv3, m6 +%else + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + movq m0, [base_reg+myq*8+subpel_filters-put_ssse3] + ALLOC_STACK mmsize*14, 14 + lea ss3q, [ssq*3] + sub srcq, ss3q + %define subpelv0 m10 + %define subpelv1 m11 + %define subpelv2 m12 + %define subpelv3 m13 + punpcklbw m0, m0 + psraw m0, 8 ; sign-extend + mova m8, [base+pw_8192] + mova m9, [base+pd_512] + pshufd m10, m0, q0000 + pshufd m11, m0, q1111 + pshufd m12, m0, q2222 + pshufd m13, m0, q3333 +%endif + pshufd m7, m1, q0000 + cmp wd, 4 + je .hv_w4 +.hv_w2: + mova m6, [base+subpel_h_shuf4] + movq m2, [srcq+ssq*0] ; 0 + movhps m2, [srcq+ssq*1] ; 0 _ 1 +%if ARCH_X86_32 + %define w8192reg [base+pw_8192] + %define d512reg [base+pd_512] + lea srcq, [srcq+ssq*2] + movq m0, [srcq+ssq*0] ; 2 + movhps m0, [srcq+ssq*1] ; 2 _ 3 + lea srcq, [srcq+ssq*2] +%else + %define w8192reg m8 + %define d512reg m9 + movq m0, [srcq+ssq*2] ; 2 + add srcq, ss3q + movhps m0, [srcq+ssq*0] ; 2 _ 3 +%endif + pshufb m2, m6 ; 0 ~ 1 ~ + pshufb m0, m6 ; 2 ~ 3 ~ + pmaddubsw m2, m7 ; subpel_filters + pmaddubsw m0, m7 ; subpel_filters + phaddw m2, m0 ; 0 1 2 3 + pmulhrsw m2, w8192reg +%if ARCH_X86_32 + movq m3, [srcq+ssq*0] ; 4 + movhps m3, [srcq+ssq*1] ; 4 _ 5 + lea srcq, [srcq+ssq*2] +%else + movq m3, [srcq+ssq*1] ; 4 + movhps m3, [srcq+ssq*2] ; 4 _ 5 + add srcq, ss3q +%endif + movq m0, [srcq+ssq*0] ; 6 + pshufb m3, m6 ; 4 ~ 5 ~ + pshufb m0, m6 ; 6 ~ + pmaddubsw m3, m7 ; subpel_filters + pmaddubsw m0, m7 ; subpel_filters + phaddw m3, m0 ; 4 5 6 _ + pmulhrsw m3, w8192reg + palignr m4, m3, m2, 4; V 1 2 3 4 + punpcklwd m1, m2, m4 ; V 01 12 0 1 1 2 + punpckhwd m2, m4 ; V 23 34 2 3 3 4 + pshufd m0, m3, q2121; V 5 6 5 6 + punpcklwd m3, m0 ; V 45 56 4 5 5 6 +.hv_w2_loop: + movq m4, [srcq+ssq*1] ; V 7 + lea srcq, [srcq+ssq*2] ; V + movhps m4, [srcq+ssq*0] ; V 7 8 + pshufb m4, m6 + pmaddubsw m4, m7 + pmaddwd m5, m1, subpelv0; V a0 b0 + mova m1, m2 ; V + pmaddwd m2, subpelv1 ; V a1 b1 + paddd m5, m2 ; V + mova m2, m3 ; V + pmaddwd m3, subpelv2 ; a2 b2 + phaddw m4, m4 + pmulhrsw m4, w8192reg + paddd m5, m3 ; V + palignr m3, m4, m0, 12 + mova m0, m4 + punpcklwd m3, m0 ; V 67 78 + pmaddwd m4, m3, subpelv3 ; V a3 b3 + paddd m5, d512reg + paddd 
m5, m4 + psrad m5, 10 + packssdw m5, m5 + packuswb m5, m5 + movd r4d, m5 + mov [dstq+dsq*0], r4w + shr r4d, 16 + mov [dstq+dsq*1], r4w + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .hv_w2_loop + RET +%undef w8192reg +%undef d512reg +.hv_w4: +%define hv4_line_0_0 4 +%define hv4_line_0_1 5 +%define hv4_line_0_2 6 +%define hv4_line_0_3 7 +%define hv4_line_0_4 8 +%define hv4_line_0_5 9 +%define hv4_line_1_0 10 +%define hv4_line_1_1 11 +%define hv4_line_1_2 12 +%define hv4_line_1_3 13 +%macro SAVELINE_W4 3 + mova [rsp+mmsize*hv4_line_%3_%2], %1 +%endmacro +%macro RESTORELINE_W4 3 + mova %1, [rsp+mmsize*hv4_line_%3_%2] +%endmacro +%if ARCH_X86_32 + %define w8192reg [base+pw_8192] + %define d512reg [base+pd_512] +%else + %define w8192reg m8 + %define d512reg m9 +%endif + ; lower shuffle 0 1 2 3 4 + mova m6, [base+subpel_h_shuf4] + movq m5, [srcq+ssq*0] ; 0 _ _ _ + movhps m5, [srcq+ssq*1] ; 0 _ 1 _ +%if ARCH_X86_32 + lea srcq, [srcq+ssq*2] + movq m4, [srcq+ssq*0] ; 2 _ _ _ + movhps m4, [srcq+ssq*1] ; 2 _ 3 _ + lea srcq, [srcq+ssq*2] +%else + movq m4, [srcq+ssq*2] ; 2 _ _ _ + movhps m4, [srcq+ss3q ] ; 2 _ 3 _ + lea srcq, [srcq+ssq*4] +%endif + pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~ + pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~ + pmaddubsw m2, m7 ;H subpel_filters + pmaddubsw m0, m7 ;H subpel_filters + phaddw m2, m0 ;H 0 1 2 3 + pmulhrsw m2, w8192reg ;H pw_8192 + SAVELINE_W4 m2, 2, 0 + ; upper shuffle 2 3 4 5 6 + mova m6, [base+subpel_h_shuf4+16] + pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~ + pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~ + pmaddubsw m2, m7 ;H subpel_filters + pmaddubsw m0, m7 ;H subpel_filters + phaddw m2, m0 ;H 0 1 2 3 + pmulhrsw m2, w8192reg ;H pw_8192 + ; + ; lower shuffle + mova m6, [base+subpel_h_shuf4] + movq m5, [srcq+ssq*0] ; 4 _ _ _ + movhps m5, [srcq+ssq*1] ; 4 _ 5 _ +%if ARCH_X86_32 + lea srcq, [srcq+ssq*2] + movq m4, [srcq+ssq*0] ; 6 _ _ _ + add srcq, ssq +%else + movq m4, [srcq+ssq*2] ; 6 _ _ _ + add srcq, ss3q +%endif + pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~ + pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~ + pmaddubsw m3, m7 ;H subpel_filters + pmaddubsw m0, m7 ;H subpel_filters + phaddw m3, m0 ;H 4 5 6 7 + pmulhrsw m3, w8192reg ;H pw_8192 + SAVELINE_W4 m3, 3, 0 + ; upper shuffle + mova m6, [base+subpel_h_shuf4+16] + pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~ + pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~ + pmaddubsw m3, m7 ;H subpel_filters + pmaddubsw m0, m7 ;H subpel_filters + phaddw m3, m0 ;H 4 5 6 7 + pmulhrsw m3, w8192reg ;H pw_8192 + ;process high + palignr m4, m3, m2, 4;V 1 2 3 4 + punpcklwd m1, m2, m4 ; V 01 12 + punpckhwd m2, m4 ; V 23 34 + pshufd m0, m3, q2121;V 5 6 5 6 + punpcklwd m3, m0 ; V 45 56 + SAVELINE_W4 m0, 0, 1 + SAVELINE_W4 m1, 1, 1 + SAVELINE_W4 m2, 2, 1 + SAVELINE_W4 m3, 3, 1 + ;process low + RESTORELINE_W4 m2, 2, 0 + RESTORELINE_W4 m3, 3, 0 + palignr m4, m3, m2, 4;V 1 2 3 4 + punpcklwd m1, m2, m4 ; V 01 12 + punpckhwd m2, m4 ; V 23 34 + pshufd m0, m3, q2121;V 5 6 5 6 + punpcklwd m3, m0 ; V 45 56 +.hv_w4_loop: + ;process low + pmaddwd m5, m1, subpelv0 ; V a0 b0 + mova m1, m2 + pmaddwd m2, subpelv1; V a1 b1 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, subpelv2; V a2 b2 + paddd m5, m3 + mova m6, [base+subpel_h_shuf4] + movq m4, [srcq+ssq*0] ; 7 + movhps m4, [srcq+ssq*1] ; 7 _ 8 _ + pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~ + pmaddubsw m4, m7 ;H subpel_filters + phaddw m4, m4 ;H 7 8 7 8 + pmulhrsw m4, w8192reg ;H pw_8192 + palignr m3, m4, m0, 12 ; 6 7 8 7 + mova m0, m4 + punpcklwd m3, m4 ; 67 78 + pmaddwd m4, m3, subpelv3; a3 b3 + paddd m5, d512reg ; 
pd_512 + paddd m5, m4 + psrad m5, 10 + SAVELINE_W4 m0, 0, 0 + SAVELINE_W4 m1, 1, 0 + SAVELINE_W4 m2, 2, 0 + SAVELINE_W4 m3, 3, 0 + SAVELINE_W4 m5, 5, 0 + ;process high + RESTORELINE_W4 m0, 0, 1 + RESTORELINE_W4 m1, 1, 1 + RESTORELINE_W4 m2, 2, 1 + RESTORELINE_W4 m3, 3, 1 + pmaddwd m5, m1, subpelv0; V a0 b0 + mova m1, m2 + pmaddwd m2, subpelv1; V a1 b1 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, subpelv2; V a2 b2 + paddd m5, m3 + mova m6, [base+subpel_h_shuf4+16] + movq m4, [srcq+ssq*0] ; 7 + movhps m4, [srcq+ssq*1] ; 7 _ 8 _ + lea srcq, [srcq+ssq*2] + pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~ + pmaddubsw m4, m7 ;H subpel_filters + phaddw m4, m4 ;H 7 8 7 8 + pmulhrsw m4, w8192reg ;H pw_8192 + palignr m3, m4, m0, 12 ; 6 7 8 7 + mova m0, m4 + punpcklwd m3, m4 ; 67 78 + pmaddwd m4, m3, subpelv3; a3 b3 + paddd m5, d512reg ; pd_512 + paddd m5, m4 + psrad m4, m5, 10 + RESTORELINE_W4 m5, 5, 0 + packssdw m5, m4 ; d -> w + packuswb m5, m5 ; w -> b + pshuflw m5, m5, q3120 + movd [dstq+dsq*0], m5 + psrlq m5, 32 + movd [dstq+dsq*1], m5 + lea dstq, [dstq+dsq*2] + sub hd, 2 + SAVELINE_W4 m0, 0, 1 + SAVELINE_W4 m1, 1, 1 + SAVELINE_W4 m2, 2, 1 + SAVELINE_W4 m3, 3, 1 + RESTORELINE_W4 m0, 0, 0 + RESTORELINE_W4 m1, 1, 0 + RESTORELINE_W4 m2, 2, 0 + RESTORELINE_W4 m3, 3, 0 + jg .hv_w4_loop + RET +%undef subpelv0 +%undef subpelv1 +%undef subpelv2 +%undef subpelv3 +.hv_w8: + %assign stack_offset org_stack_offset +%define hv8_line_1 0 +%define hv8_line_2 1 +%define hv8_line_3 2 +%define hv8_line_4 3 +%define hv8_line_6 4 +%macro SAVELINE_W8 2 + mova [rsp+hv8_line_%1*mmsize], %2 +%endmacro +%macro RESTORELINE_W8 2 + mova %2, [rsp+hv8_line_%1*mmsize] +%endmacro + shr mxd, 16 + sub srcq, 3 +%if ARCH_X86_32 + %define base_reg r1 + %define subpelh0 [rsp+mmsize*5] + %define subpelh1 [rsp+mmsize*6] + %define subpelv0 [rsp+mmsize*7] + %define subpelv1 [rsp+mmsize*8] + %define subpelv2 [rsp+mmsize*9] + %define subpelv3 [rsp+mmsize*10] + %define accuv0 [rsp+mmsize*11] + %define accuv1 [rsp+mmsize*12] + movq m1, [base_reg+mxq*8+subpel_filters-put_ssse3] + movzx mxd, ssb + shr ssd, 16 + cmp hd, 6 + cmovs ssd, mxd + movq m5, [base_reg+ssq*8+subpel_filters-put_ssse3] + mov ssq, ssmp + ALLOC_STACK -mmsize*13 +%if STACK_ALIGNMENT < 16 + %define dstm [rsp+mmsize*13+gprsize*1] + %define dsm [rsp+mmsize*13+gprsize*2] + mov r6, [rstk+stack_offset+gprsize*2] + mov dsm, r6 +%endif + pshufd m0, m1, q0000 + pshufd m1, m1, q1111 + punpcklbw m5, m5 + psraw m5, 8 ; sign-extend + pshufd m2, m5, q0000 + pshufd m3, m5, q1111 + pshufd m4, m5, q2222 + pshufd m5, m5, q3333 + mova subpelh0, m0 + mova subpelh1, m1 + mova subpelv0, m2 + mova subpelv1, m3 + mova subpelv2, m4 + mova subpelv3, m5 + lea r6, [ssq*3] + mov dstm, dstq + sub srcq, r6 +%else + ALLOC_STACK 16*5, 16 + %define subpelh0 m10 + %define subpelh1 m11 + %define subpelv0 m12 + %define subpelv1 m13 + %define subpelv2 m14 + %define subpelv3 m15 + %define accuv0 m8 + %define accuv1 m9 + movq m0, [base_reg+mxq*8+subpel_filters-put_ssse3] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + movq m1, [base_reg+myq*8+subpel_filters-put_ssse3] + pshufd subpelh0, m0, q0000 + pshufd subpelh1, m0, q1111 + punpcklbw m1, m1 + psraw m1, 8 ; sign-extend + pshufd subpelv0, m1, q0000 + pshufd subpelv1, m1, q1111 + pshufd subpelv2, m1, q2222 + pshufd subpelv3, m1, q3333 + lea ss3q, [ssq*3] + mov r7, dstq + sub srcq, ss3q +%endif + shl wd, 14 + lea r6d, [hq+wq-(1<<16)] + mov r4, srcq +.hv_w8_loop0: + movu m4, [srcq+ssq*0] ; 0 = _ _ + movu m5, [srcq+ssq*1] ; 1 = _ _ +%if ARCH_X86_32 + lea srcq, 
[srcq+ssq*2] +%endif +%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3] + %if ARCH_X86_32 + pshufb %3, %1, [base+subpel_h_shufB] + pshufb %4, %1, [base+subpel_h_shufC] + pshufb %1, [base+subpel_h_shufA] + %else + pshufb %3, %1, %6 ; subpel_h_shufB + pshufb %4, %1, %7 ; subpel_h_shufC + pshufb %1, %5 ; subpel_h_shufA + %endif + pmaddubsw %2, %3, subpelh0 ; subpel +0 C0 + pmaddubsw %4, subpelh1; subpel +4 B4 + pmaddubsw %3, subpelh1; C4 + pmaddubsw %1, subpelh0; A0 + paddw %2, %4 ; C0+B4 + paddw %1, %3 ; A0+C4 + phaddw %1, %2 +%endmacro +%if ARCH_X86_64 + mova m7, [base+subpel_h_shufA] + mova m8, [base+subpel_h_shufB] + mova m9, [base+subpel_h_shufC] +%endif + HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~ + HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~ +%if ARCH_X86_32 + movu m6, [srcq+ssq*0] ; 2 = _ _ + movu m0, [srcq+ssq*1] ; 3 = _ _ + lea srcq, [srcq+ssq*2] +%else + movu m6, [srcq+ssq*2] ; 2 = _ _ + add srcq, ss3q + movu m0, [srcq+ssq*0] ; 3 = _ _ +%endif + HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~ + HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~ + mova m7, [base+pw_8192] + pmulhrsw m4, m7 ; H pw_8192 + pmulhrsw m5, m7 ; H pw_8192 + pmulhrsw m6, m7 ; H pw_8192 + pmulhrsw m0, m7 ; H pw_8192 + punpcklwd m1, m4, m5 ; 0 1 ~ + punpcklwd m2, m5, m6 ; 1 2 ~ + punpcklwd m3, m6, m0 ; 2 3 ~ + SAVELINE_W8 1, m1 + SAVELINE_W8 2, m2 + SAVELINE_W8 3, m3 + mova m7, [base+subpel_h_shufA] +%if ARCH_X86_32 + movu m4, [srcq+ssq*0] ; 4 = _ _ + movu m5, [srcq+ssq*1] ; 5 = _ _ + lea srcq, [srcq+ssq*2] +%else + movu m4, [srcq+ssq*1] ; 4 = _ _ + movu m5, [srcq+ssq*2] ; 5 = _ _ + add srcq, ss3q +%endif + movu m6, [srcq+ssq*0] ; 6 = _ _ + HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~ + HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~ + HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~ + mova m7, [base+pw_8192] + pmulhrsw m1, m4, m7 ; H pw_8192 4 ~ + pmulhrsw m2, m5, m7 ; H pw_8192 5 ~ + pmulhrsw m3, m6, m7 ; H pw_8192 6 ~ + punpcklwd m4, m0, m1 ; 3 4 ~ + punpcklwd m5, m1, m2 ; 4 5 ~ + punpcklwd m6, m2, m3 ; 5 6 ~ + SAVELINE_W8 6, m3 + RESTORELINE_W8 1, m1 + RESTORELINE_W8 2, m2 + RESTORELINE_W8 3, m3 +.hv_w8_loop: + ; m8 accu for V a + ; m9 accu for V b + SAVELINE_W8 1, m3 + SAVELINE_W8 2, m4 + SAVELINE_W8 3, m5 + SAVELINE_W8 4, m6 +%if ARCH_X86_32 + pmaddwd m0, m1, subpelv0 ; a0 + pmaddwd m7, m2, subpelv0 ; b0 + pmaddwd m3, subpelv1 ; a1 + pmaddwd m4, subpelv1 ; b1 + paddd m0, m3 + paddd m7, m4 + pmaddwd m5, subpelv2 ; a2 + pmaddwd m6, subpelv2 ; b2 + paddd m0, m5 + paddd m7, m6 + mova m5, [base+pd_512] + paddd m0, m5 ; pd_512 + paddd m7, m5 ; pd_512 + mova accuv0, m0 + mova accuv1, m7 +%else + pmaddwd m8, m1, subpelv0 ; a0 + pmaddwd m9, m2, subpelv0 ; b0 + pmaddwd m3, subpelv1 ; a1 + pmaddwd m4, subpelv1 ; b1 + paddd m8, m3 + paddd m9, m4 + pmaddwd m5, subpelv2 ; a2 + pmaddwd m6, subpelv2 ; b2 + paddd m8, m5 + paddd m9, m6 + mova m7, [base+pd_512] + paddd m8, m7 ; pd_512 + paddd m9, m7 ; pd_512 + mova m7, [base+subpel_h_shufB] + mova m6, [base+subpel_h_shufC] + mova m5, [base+subpel_h_shufA] +%endif + movu m0, [srcq+ssq*1] ; 7 + movu m4, [srcq+ssq*2] ; 8 + lea srcq, [srcq+ssq*2] + HV_H_W8 m0, m1, m2, m3, m5, m7, m6 + HV_H_W8 m4, m1, m2, m3, m5, m7, m6 + mova m5, [base+pw_8192] + pmulhrsw m0, m5 ; H pw_8192 + pmulhrsw m4, m5 ; H pw_8192 + RESTORELINE_W8 6, m6 + punpcklwd m5, m6, m0 ; 6 7 ~ + punpcklwd m6, m0, m4 ; 7 8 ~ + pmaddwd m1, m5, subpelv3 ; a3 + paddd m2, m1, accuv0 + pmaddwd m1, m6, subpelv3 ; b3 + paddd m1, m1, accuv1 ; H + V + psrad m2, 10 + psrad m1, 10 + packssdw m2, m1 ; d -> w + packuswb m2, m1 ; w -> b + 
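+; Scaling of the two-pass put_8tap HV result formed above (C-like
+; sketch, illustrative names; per-pass taps sum to 64 as before):
+;   mid[y][x] = (hsum +   2) >>  2;              // pmulhrsw pw_8192
+;   dst[y][x] = clip_pixel((vsum + 512) >> 10);  // pd_512 + psrad 10
+; The 2 + 10 = 12 bits shifted out match the 64*64 = 2^12 filter gain.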
movd [dstq+dsq*0], m2 + psrlq m2, 32 +%if ARCH_X86_32 + add dstq, dsm + movd [dstq+dsq*0], m2 + add dstq, dsm +%else + movd [dstq+dsq*1], m2 + lea dstq, [dstq+dsq*2] +%endif + sub hd, 2 + jle .hv_w8_outer + SAVELINE_W8 6, m4 + RESTORELINE_W8 1, m1 + RESTORELINE_W8 2, m2 + RESTORELINE_W8 3, m3 + RESTORELINE_W8 4, m4 + jmp .hv_w8_loop +.hv_w8_outer: +%if ARCH_X86_32 + mov dstq, dstm + add r4, 4 + movzx hd, r6w + add dstq, 4 + mov srcq, r4 + mov dstm, dstq +%else + add r4, 4 + add r7, 4 + movzx hd, r6b + mov srcq, r4 + mov dstq, r7 +%endif + sub r6d, 1<<16 + jg .hv_w8_loop0 + RET + +%macro PSHUFB_SUBPEL_H_4 5 ; dst/src1, src2/mask, tmp1, tmp2, reset_mask + %if cpuflag(ssse3) + pshufb %1, %2 + %else + %if %5 == 1 + pcmpeqd %2, %2 + psrlq %2, 32 + %endif + psrldq %3, %1, 1 + pshufd %3, %3, q2301 + pand %1, %2 + pandn %4, %2, %3 + por %1, %4 + %endif +%endmacro + +%macro PSHUFB_SUBPEL_H_4a 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask + %ifnidn %1, %2 + mova %1, %2 + %endif + PSHUFB_SUBPEL_H_4 %1, %3, %4, %5, %6 +%endmacro + +%macro PSHUFB_SUBPEL_H_4b 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask + %if notcpuflag(ssse3) + psrlq %1, %2, 16 + %elifnidn %1, %2 + mova %1, %2 + %endif + PSHUFB_SUBPEL_H_4 %1, %3, %4, %5, %6 +%endmacro + +%macro PALIGNR 4-5 ; dst, src1, src2, shift[, tmp] + %if cpuflag(ssse3) + palignr %1, %2, %3, %4 + %else + %if %0 == 4 + %assign %%i regnumof%+%1 + 1 + %define %%tmp m %+ %%i + %else + %define %%tmp %5 + %endif + psrldq %1, %3, %4 + pslldq %%tmp, %2, 16-%4 + por %1, %%tmp + %endif +%endmacro + +%macro PHADDW 4 ; dst, src, pw_1/tmp, load_pw_1 + %if cpuflag(ssse3) + phaddw %1, %2 + %elifnidn %1, %2 + %if %4 == 1 + mova %3, [base+pw_1] + %endif + pmaddwd %1, %3 + pmaddwd %2, %3 + packssdw %1, %2 + %else + %if %4 == 1 + pmaddwd %1, [base+pw_1] + %else + pmaddwd %1, %3 + %endif + packssdw %1, %1 + %endif +%endmacro + +%macro PMULHRSW_POW2 4 ; dst, src1, src2, shift + %if cpuflag(ssse3) + pmulhrsw %1, %2, %3 + %else + paddw %1, %2, %3 + psraw %1, %4 + %endif +%endmacro + +%macro PMULHRSW_8192 3 ; dst, src1, src2 + PMULHRSW_POW2 %1, %2, %3, 2 +%endmacro + +%macro PREP_8TAP_H_LOAD4 5 ; dst, src_memloc, tmp[1-2] + movd %1, [%2+0] + movd %3, [%2+1] + movd %4, [%2+2] + movd %5, [%2+3] + punpckldq %1, %3 + punpckldq %4, %5 + punpcklqdq %1, %4 +%endmacro + +%macro PREP_8TAP_H_LOAD 2 ; dst0, src_memloc + %if cpuflag(ssse3) + movu m%1, [%2] + pshufb m2, m%1, m11 ; subpel_h_shufB + pshufb m3, m%1, m9 ; subpel_h_shufC + pshufb m%1, m10 ; subpel_h_shufA + %else + %if ARCH_X86_64 + SWAP m12, m5 + SWAP m13, m6 + SWAP m14, m7 + %define %%mx0 m%+%%i + %define %%mx1 m%+%%j + %assign %%i 0 + %rep 12 + movd %%mx0, [%2+%%i] + %assign %%i %%i+1 + %endrep + %assign %%i 0 + %rep 6 + %assign %%j %%i+1 + punpckldq %%mx0, %%mx1 + %assign %%i %%i+2 + %endrep + %assign %%i 0 + %rep 3 + %assign %%j %%i+2 + punpcklqdq %%mx0, %%mx1 + %assign %%i %%i+4 + %endrep + SWAP m%1, m0 + SWAP m2, m4 + SWAP m3, m8 + SWAP m5, m12 + SWAP m6, m13 + SWAP m7, m14 + %else + PREP_8TAP_H_LOAD4 m0, %2+0, m1, m4, m7 + PREP_8TAP_H_LOAD4 m2, %2+4, m1, m4, m7 + PREP_8TAP_H_LOAD4 m3, %2+8, m1, m4, m7 + SWAP m%1, m0 + %endif + %endif +%endmacro + +%macro PREP_8TAP_H 2 ; dst, src_memloc + PREP_8TAP_H_LOAD %1, %2 + %if ARCH_X86_64 && notcpuflag(ssse3) + SWAP m8, m1 + SWAP m9, m7 + %endif + %xdefine mX m%+%1 + %assign %%i regnumof%+mX + %define mX m%+%%i + mova m4, m2 + PMADDUBSW m4, m5, m1, m7, 1 ; subpel +0 B0 + PMADDUBSW m2, m6, m1, m7, 0 ; subpel +4 B4 + PMADDUBSW m3, m6, m1, m7, 0 ; subpel +4 C4 + PMADDUBSW mX, m5, m1, 
m7, 0 ; subpel +0 A0 + %undef mX + %if ARCH_X86_64 && notcpuflag(ssse3) + SWAP m1, m8 + SWAP m7, m9 + %endif + paddw m3, m4 + paddw m%1, m2 + PHADDW m%1, m3, m15, ARCH_X86_32 + %if ARCH_X86_64 || cpuflag(ssse3) + PMULHRSW_8192 m%1, m%1, m7 + %else + PMULHRSW_8192 m%1, m%1, [base+pw_2] + %endif +%endmacro + +%macro PREP_8TAP_HV 4 ; dst, src_memloc, tmp[1-2] + %if cpuflag(ssse3) + movu %1, [%2] + pshufb m2, %1, shufB + pshufb m3, %1, shufC + pshufb %1, shufA + %else + PREP_8TAP_H_LOAD4 %1, %2+0, m1, %3, %4 + PREP_8TAP_H_LOAD4 m2, %2+4, m1, %3, %4 + PREP_8TAP_H_LOAD4 m3, %2+8, m1, %3, %4 + %endif + mova m1, m2 + PMADDUBSW m1, subpelh0, %3, %4, 1 ; subpel +0 C0 + PMADDUBSW m3, subpelh1, %3, %4, 0 ; subpel +4 B4 + PMADDUBSW m2, subpelh1, %3, %4, 0 ; C4 + PMADDUBSW %1, subpelh0, %3, %4, 0 ; A0 + paddw m1, m3 ; C0+B4 + paddw %1, m2 ; A0+C4 + PHADDW %1, m1, %3, 1 +%endmacro + +%macro PREP_8TAP 0 +%if ARCH_X86_32 + DECLARE_REG_TMP 1, 2 +%elif WIN64 + DECLARE_REG_TMP 6, 4 +%else + DECLARE_REG_TMP 6, 7 +%endif + +FN prep_8tap, sharp, SHARP, SHARP +FN prep_8tap, sharp_smooth, SHARP, SMOOTH +FN prep_8tap, smooth_sharp, SMOOTH, SHARP +FN prep_8tap, smooth, SMOOTH, SMOOTH +FN prep_8tap, sharp_regular, SHARP, REGULAR +FN prep_8tap, regular_sharp, REGULAR, SHARP +FN prep_8tap, smooth_regular, SMOOTH, REGULAR +FN prep_8tap, regular_smooth, REGULAR, SMOOTH +FN prep_8tap, regular, REGULAR, REGULAR + +%if ARCH_X86_32 + %define base_reg r2 + %define base base_reg-prep%+SUFFIX +%else + %define base_reg r7 + %define base 0 +%endif +cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 +%assign org_stack_offset stack_offset + imul mxd, mxm, 0x010101 + add mxd, t0d ; 8tap_h, mx, 4tap_h + imul myd, mym, 0x010101 + add myd, t1d ; 8tap_v, my, 4tap_v + mov wd, wm + movifnidn srcd, srcm + movifnidn hd, hm + test mxd, 0xf00 + jnz .h + test myd, 0xf00 + jnz .v + LEA base_reg, prep_ssse3 + tzcnt wd, wd + movzx wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2] + pxor m4, m4 + add wq, base_reg + movifnidn strided, stridem + lea r6, [strideq*3] + %assign stack_offset org_stack_offset +%if WIN64 + pop r8 + pop r7 +%endif + jmp wq +.h: + LEA base_reg, prep%+SUFFIX + test myd, 0xf00 + jnz .hv +%if cpuflag(ssse3) + WIN64_SPILL_XMM 12 +%else + WIN64_SPILL_XMM 16 +%endif +%if ARCH_X86_32 + %define strideq r6 + mov strideq, stridem +%endif + cmp wd, 4 + je .h_w4 + tzcnt wd, wd +%if cpuflag(ssse3) + %if ARCH_X86_64 + mova m10, [base+subpel_h_shufA] + mova m11, [base+subpel_h_shufB] + mova m9, [base+subpel_h_shufC] + %else + %define m10 [base+subpel_h_shufA] + %define m11 [base+subpel_h_shufB] + %define m9 [base+subpel_h_shufC] + %endif +%endif + shr mxd, 16 + sub srcq, 3 + movzx wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)] + movq m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX] +%if cpuflag(ssse3) + mova m7, [base+pw_8192] + pshufd m5, m6, q0000 + pshufd m6, m6, q1111 +%else + punpcklbw m6, m6 + psraw m6, 8 + %if ARCH_X86_64 + mova m7, [pw_2] + mova m15, [pw_1] + %else + %define m15 m4 + %endif + pshufd m5, m6, q1010 + punpckhqdq m6, m6 +%endif + add wq, base_reg + jmp wq +.h_w4: +%if ARCH_X86_32 + and mxd, 0x7f +%else + movzx mxd, mxb +%endif + dec srcq + movd m4, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2] +%if cpuflag(ssse3) + mova m6, [base+pw_8192] + mova m5, [base+subpel_h_shufA] + pshufd m4, m4, q0000 +%else + mova m6, [base+pw_2] + %if ARCH_X86_64 + mova m14, [pw_1] + %else + %define m14 m7 + %endif + punpcklbw m4, m4 + psraw m4, 8 + punpcklqdq m4, m4 +%endif +%if ARCH_X86_64 + lea stride3q, 
[strideq*3] +%endif +.h_w4_loop: +%if cpuflag(ssse3) + movq m0, [srcq+strideq*0] ; 0 + movq m1, [srcq+strideq*1] ; 1 + %if ARCH_X86_32 + lea srcq, [srcq+strideq*2] + movq m2, [srcq+strideq*0] ; 2 + movq m3, [srcq+strideq*1] ; 3 + lea srcq, [srcq+strideq*2] + %else + movq m2, [srcq+strideq*2] ; 2 + movq m3, [srcq+stride3q ] ; 3 + lea srcq, [srcq+strideq*4] + %endif + pshufb m0, m5 + pshufb m1, m5 + pshufb m2, m5 + pshufb m3, m5 +%elif ARCH_X86_64 + movd m0, [srcq+strideq*0+0] + movd m12, [srcq+strideq*0+1] + movd m1, [srcq+strideq*1+0] + movd m5, [srcq+strideq*1+1] + movd m2, [srcq+strideq*2+0] + movd m13, [srcq+strideq*2+1] + movd m3, [srcq+stride3q +0] + movd m7, [srcq+stride3q +1] + punpckldq m0, m12 + punpckldq m1, m5 + punpckldq m2, m13 + punpckldq m3, m7 + movd m12, [srcq+strideq*0+2] + movd m8, [srcq+strideq*0+3] + movd m5, [srcq+strideq*1+2] + movd m9, [srcq+strideq*1+3] + movd m13, [srcq+strideq*2+2] + movd m10, [srcq+strideq*2+3] + movd m7, [srcq+stride3q +2] + movd m11, [srcq+stride3q +3] + lea srcq, [srcq+strideq*4] + punpckldq m12, m8 + punpckldq m5, m9 + punpckldq m13, m10 + punpckldq m7, m11 + punpcklqdq m0, m12 ; 0 + punpcklqdq m1, m5 ; 1 + punpcklqdq m2, m13 ; 2 + punpcklqdq m3, m7 ; 3 +%else + movd m0, [srcq+strideq*0+0] + movd m1, [srcq+strideq*0+1] + movd m2, [srcq+strideq*0+2] + movd m3, [srcq+strideq*0+3] + punpckldq m0, m1 + punpckldq m2, m3 + punpcklqdq m0, m2 ; 0 + movd m1, [srcq+strideq*1+0] + movd m2, [srcq+strideq*1+1] + movd m3, [srcq+strideq*1+2] + movd m7, [srcq+strideq*1+3] + lea srcq, [srcq+strideq*2] + punpckldq m1, m2 + punpckldq m3, m7 + punpcklqdq m1, m3 ; 1 + movd m2, [srcq+strideq*0+0] + movd m3, [srcq+strideq*0+1] + movd m7, [srcq+strideq*0+2] + movd m5, [srcq+strideq*0+3] + punpckldq m2, m3 + punpckldq m7, m5 + punpcklqdq m2, m7 ; 2 + movd m3, [srcq+strideq*1+0] + movd m7, [srcq+strideq*1+1] + punpckldq m3, m7 + movd m7, [srcq+strideq*1+2] + movd m5, [srcq+strideq*1+3] + lea srcq, [srcq+strideq*2] + punpckldq m7, m5 + punpcklqdq m3, m7 ; 3 +%endif + PMADDUBSW m0, m4, m5, m7, 1 ; subpel_filters + 2 + PMADDUBSW m1, m4, m5, m7, 0 + PMADDUBSW m2, m4, m5, m7, 0 + PMADDUBSW m3, m4, m5, m7, 0 + PHADDW m0, m1, m14, ARCH_X86_32 + PHADDW m2, m3, m14, 0 + PMULHRSW_8192 m0, m0, m6 + PMULHRSW_8192 m2, m2, m6 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m2 + add tmpq, 32 + sub hd, 4 + jg .h_w4_loop + RET +.h_w8: +%if cpuflag(ssse3) + PREP_8TAP_H 0, srcq+strideq*0 + PREP_8TAP_H 1, srcq+strideq*1 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + lea srcq, [srcq+strideq*2] + add tmpq, 32 + sub hd, 2 +%else + PREP_8TAP_H 0, srcq + mova [tmpq], m0 + add srcq, strideq + add tmpq, 16 + dec hd +%endif + jg .h_w8 + RET +.h_w16: + mov r3, -16*1 + jmp .h_start +.h_w32: + mov r3, -16*2 + jmp .h_start +.h_w64: + mov r3, -16*4 + jmp .h_start +.h_w128: + mov r3, -16*8 +.h_start: + sub srcq, r3 + mov r5, r3 +.h_loop: +%if cpuflag(ssse3) + PREP_8TAP_H 0, srcq+r3+8*0 + PREP_8TAP_H 1, srcq+r3+8*1 + mova [tmpq+16*0], m0 + mova [tmpq+16*1], m1 + add tmpq, 32 + add r3, 16 +%else + PREP_8TAP_H 0, srcq+r3 + mova [tmpq], m0 + add tmpq, 16 + add r3, 8 +%endif + jl .h_loop + add srcq, strideq + mov r3, r5 + dec hd + jg .h_loop + RET +.v: + LEA base_reg, prep%+SUFFIX +%if ARCH_X86_32 + mov mxd, myd + and mxd, 0x7f +%else + %assign stack_offset org_stack_offset + WIN64_SPILL_XMM 16 + movzx mxd, myb +%endif + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] +%if cpuflag(ssse3) + mova m2, [base+pw_512] + mova m7, [base+pw_8192] + punpcklwd m0, m0 
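+; prep kernels write 16-bit intermediates scaled to 16 * pixel (4 bits
+; of headroom), matching the unfiltered prep path's src << 4. C-like
+; sketch for the vertical pass below (illustrative):
+;   tmp[y][x] = (int16_t)((vsum + 2) >> 2);  // pmulhrsw pw_8192
+; Without SSSE3 the same rounding is done as paddw pw_2 + psraw 2
+; via PMULHRSW_POW2.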
+%else + punpcklbw m0, m0 + psraw m0, 8 +%endif +%if ARCH_X86_32 + %define subpel0 [rsp+mmsize*0] + %define subpel1 [rsp+mmsize*1] + %define subpel2 [rsp+mmsize*2] + %define subpel3 [rsp+mmsize*3] +%assign regs_used 6 ; use r5 (mx) as tmp for stack alignment if needed + %if cpuflag(ssse3) + ALLOC_STACK -mmsize*4 + %else + ALLOC_STACK -mmsize*5 + %endif +%assign regs_used 7 + mov strideq, [rstk+stack_offset+gprsize*3] + pshufd m1, m0, q0000 + mova subpel0, m1 + pshufd m1, m0, q1111 + mova subpel1, m1 + lea r5, [strideq*3] + pshufd m1, m0, q2222 + mova subpel2, m1 + pshufd m1, m0, q3333 + mova subpel3, m1 + sub srcq, r5 +%else + %define subpel0 m8 + %define subpel1 m9 + %define subpel2 m10 + %define subpel3 m11 + pshufd m8, m0, q0000 + pshufd m9, m0, q1111 + lea stride3q, [strideq*3] + pshufd m10, m0, q2222 + pshufd m11, m0, q3333 + sub srcq, stride3q + cmp wd, 8 + jns .v_w8 +%endif +.v_w4: +%if notcpuflag(ssse3) + pxor m6, m6 + %if ARCH_X86_64 + mova m7, [base+pw_2] + %endif +%endif +%if ARCH_X86_32 + %if STACK_ALIGNMENT < mmsize + %define srcm [esp+stack_size+gprsize*1] + %define tmpm [esp+stack_size+gprsize*2] + %endif + mov tmpm, tmpq + mov srcm, srcq + lea r5d, [wq - 4] ; horizontal loop + shl r5d, (16 - 2) ; (wq / 4) << 16 + mov r5w, hw +.v_w4_loop0: +%endif + movd m1, [srcq+strideq*0] + movd m0, [srcq+strideq*1] +%if ARCH_X86_32 + lea srcq, [srcq+strideq*2] + movd m2, [srcq+strideq*0] + movd m4, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + movd m3, [srcq+strideq*0] + movd m5, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] +%else + movd m2, [srcq+strideq*2] + add srcq, stride3q + movd m4, [srcq+strideq*0] + movd m3, [srcq+strideq*1] + movd m5, [srcq+strideq*2] + add srcq, stride3q +%endif + punpckldq m1, m0 ; 0 1 + punpckldq m0, m2 ; 1 2 + punpcklbw m1, m0 ; 01 12 + movd m0, [srcq+strideq*0] + punpckldq m2, m4 ; 2 3 + punpckldq m4, m3 ; 3 4 + punpckldq m3, m5 ; 4 5 + punpckldq m5, m0 ; 5 6 + punpcklbw m2, m4 ; 23 34 + punpcklbw m3, m5 ; 45 56 +.v_w4_loop: +%if ARCH_X86_32 && notcpuflag(ssse3) + mova m7, subpel0 + %define subpel0 m7 +%endif + mova m5, m1 + PMADDUBSW m5, subpel0, m6, m4, 0 ; a0 b0 +%if ARCH_X86_32 && notcpuflag(ssse3) + mova m7, subpel1 + %define subpel1 m7 +%endif + mova m1, m2 + PMADDUBSW m2, subpel1, m6, m4, 0 ; a1 b1 + paddw m5, m2 +%if ARCH_X86_32 && notcpuflag(ssse3) + mova m7, subpel2 + %define subpel2 m7 +%endif + mova m2, m3 + PMADDUBSW m3, subpel2, m6, m4, 0 ; a2 b2 + movd m4, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + paddw m5, m3 + punpckldq m3, m0, m4 ; 6 7 _ _ + movd m0, [srcq+strideq*0] + punpckldq m4, m0 ; 7 8 _ _ + punpcklbw m3, m4 ; 67 78 +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m12, m0 + %else + mova [esp+mmsize*4], m0 + mova m7, subpel3 + %define subpel3 m7 + %endif +%endif + mova m4, m3 + PMADDUBSW m4, subpel3, m6, m0, 0 ; a3 b3 + paddw m5, m4 +%if ARCH_X86_64 || cpuflag(ssse3) + %if notcpuflag(ssse3) + SWAP m0, m12 + %endif + PMULHRSW_8192 m5, m5, m7 +%else + mova m0, [esp+mmsize*4] + PMULHRSW_8192 m5, m5, [base+pw_2] +%endif + movq [tmpq+wq*0], m5 + movhps [tmpq+wq*2], m5 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jg .v_w4_loop +%if ARCH_X86_32 + mov srcq, srcm + mov tmpq, tmpm + movzx hd, r5w + add srcq, 4 + add tmpq, 8 + mov srcm, srcq + mov tmpm, tmpq + sub r5d, 1<<16 ; horizontal-- + jg .v_w4_loop0 +%endif + RET +%if ARCH_X86_64 +.v_w8: + lea r6d, [wq*8-64] + mov r5, srcq + mov r8, tmpq + lea r6d, [hq+r6*4] +.v_w8_loop0: + movq m1, [srcq+strideq*0] + movq m2, [srcq+strideq*1] + movq m3, [srcq+strideq*2] + add srcq, stride3q + movq m4, 
[srcq+strideq*0] + movq m5, [srcq+strideq*1] + movq m6, [srcq+strideq*2] + add srcq, stride3q + movq m0, [srcq+strideq*0] + punpcklbw m1, m2 ; 01 + punpcklbw m2, m3 ; 12 + punpcklbw m3, m4 ; 23 + punpcklbw m4, m5 ; 34 + punpcklbw m5, m6 ; 45 + punpcklbw m6, m0 ; 56 +.v_w8_loop: + movq m13, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] +%if cpuflag(ssse3) + pmaddubsw m14, m1, subpel0 ; a0 + pmaddubsw m15, m2, subpel0 ; b0 + mova m1, m3 + mova m2, m4 + pmaddubsw m3, subpel1 ; a1 + pmaddubsw m4, subpel1 ; b1 + paddw m14, m3 + paddw m15, m4 + mova m3, m5 + mova m4, m6 + pmaddubsw m5, subpel2 ; a2 + pmaddubsw m6, subpel2 ; b2 + punpcklbw m12, m0, m13 ; 67 + movq m0, [srcq+strideq*0] + punpcklbw m13, m0 ; 78 + paddw m14, m5 + mova m5, m12 + pmaddubsw m12, subpel3 ; a3 + paddw m15, m6 + mova m6, m13 + pmaddubsw m13, subpel3 ; b3 + paddw m14, m12 + paddw m15, m13 + pmulhrsw m14, m7 + pmulhrsw m15, m7 +%else + mova m14, m1 + PMADDUBSW m14, subpel0, m7, m12, 1 ; a0 + mova m15, m2 + PMADDUBSW m15, subpel0, m7, m12, 0 ; b0 + mova m1, m3 + PMADDUBSW m3, subpel1, m7, m12, 0 ; a1 + mova m2, m4 + PMADDUBSW m4, subpel1, m7, m12, 0 ; b1 + paddw m14, m3 + mova m3, m5 + PMADDUBSW m5, subpel2, m7, m12, 0 ; a2 + paddw m15, m4 + mova m4, m6 + PMADDUBSW m6, subpel2, m7, m12, 0 ; b2 + paddw m15, m6 + punpcklbw m12, m0, m13 ; 67 + movq m0, [srcq+strideq*0] + punpcklbw m13, m0 ; 78 + paddw m14, m5 + mova m5, m12 + PMADDUBSW m12, subpel3, m7, m6, 0 ; a3 + paddw m14, m12 + mova m6, m13 + PMADDUBSW m13, subpel3, m7, m12, 0 ; b3 + paddw m15, m13 + PMULHRSW_8192 m14, m14, [base+pw_2] + PMULHRSW_8192 m15, m15, [base+pw_2] +%endif + movu [tmpq+wq*0], m14 + movu [tmpq+wq*2], m15 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jg .v_w8_loop + add r5, 8 + add r8, 16 + movzx hd, r6b + mov srcq, r5 + mov tmpq, r8 + sub r6d, 1<<8 + jg .v_w8_loop0 + RET +%endif ;ARCH_X86_64 +%undef subpel0 +%undef subpel1 +%undef subpel2 +%undef subpel3 +.hv: + %assign stack_offset org_stack_offset + cmp wd, 4 + jg .hv_w8 + and mxd, 0x7f + movd m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2] +%if ARCH_X86_32 + mov mxd, myd + shr myd, 16 + and mxd, 0x7f + cmp hd, 6 + cmovs myd, mxd + movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] + mov strideq, stridem + %assign regs_used 6 + ALLOC_STACK -mmsize*14 + %assign regs_used 7 + lea r5, [strideq*3+1] + sub srcq, r5 + %define subpelv0 [rsp+mmsize*0] + %define subpelv1 [rsp+mmsize*1] + %define subpelv2 [rsp+mmsize*2] + %define subpelv3 [rsp+mmsize*3] + punpcklbw m0, m0 + psraw m0, 8 + pshufd m6, m0, q0000 + mova subpelv0, m6 + pshufd m6, m0, q1111 + mova subpelv1, m6 + pshufd m6, m0, q2222 + mova subpelv2, m6 + pshufd m6, m0, q3333 + mova subpelv3, m6 +%else + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] + %if cpuflag(ssse3) + ALLOC_STACK mmsize*14, 14 + %else + ALLOC_STACK mmsize*14, 16 + %endif + lea stride3q, [strideq*3] + sub srcq, stride3q + dec srcq + %define subpelv0 m10 + %define subpelv1 m11 + %define subpelv2 m12 + %define subpelv3 m13 + punpcklbw m0, m0 + psraw m0, 8 + %if cpuflag(ssse3) + mova m8, [base+pw_8192] + %else + mova m8, [base+pw_2] + %endif + mova m9, [base+pd_32] + pshufd m10, m0, q0000 + pshufd m11, m0, q1111 + pshufd m12, m0, q2222 + pshufd m13, m0, q3333 +%endif + pshufd m7, m1, q0000 +%if notcpuflag(ssse3) + punpcklbw m7, m7 + psraw m7, 8 +%endif +%define hv4_line_0_0 4 +%define hv4_line_0_1 5 +%define hv4_line_0_2 6 +%define hv4_line_0_3 7 +%define hv4_line_0_4 8 +%define hv4_line_0_5 9 +%define hv4_line_1_0 
10 +%define hv4_line_1_1 11 +%define hv4_line_1_2 12 +%define hv4_line_1_3 13 +%if ARCH_X86_32 + %if cpuflag(ssse3) + %define w8192reg [base+pw_8192] + %else + %define w8192reg [base+pw_2] + %endif + %define d32reg [base+pd_32] +%else + %define w8192reg m8 + %define d32reg m9 +%endif + ; lower shuffle 0 1 2 3 4 +%if cpuflag(ssse3) + mova m6, [base+subpel_h_shuf4] +%else + %if ARCH_X86_64 + mova m15, [pw_1] + %else + %define m15 m1 + %endif +%endif + movq m5, [srcq+strideq*0] ; 0 _ _ _ + movhps m5, [srcq+strideq*1] ; 0 _ 1 _ +%if ARCH_X86_32 + lea srcq, [srcq+strideq*2] + movq m4, [srcq+strideq*0] ; 2 _ _ _ + movhps m4, [srcq+strideq*1] ; 2 _ 3 _ + lea srcq, [srcq+strideq*2] +%else + movq m4, [srcq+strideq*2] ; 2 _ _ _ + movhps m4, [srcq+stride3q ] ; 2 _ 3 _ + lea srcq, [srcq+strideq*4] +%endif + PSHUFB_SUBPEL_H_4a m2, m5, m6, m1, m3, 1 ;H subpel_h_shuf4 0~1~ + PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m3, 0 ;H subpel_h_shuf4 2~3~ + PMADDUBSW m2, m7, m1, m3, 1 ;H subpel_filters + PMADDUBSW m0, m7, m1, m3, 0 ;H subpel_filters + PHADDW m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3 + PMULHRSW_8192 m2, m2, w8192reg + SAVELINE_W4 m2, 2, 0 + ; upper shuffle 2 3 4 5 6 +%if cpuflag(ssse3) + mova m6, [base+subpel_h_shuf4+16] +%endif + PSHUFB_SUBPEL_H_4b m2, m5, m6, m1, m3, 0 ;H subpel_h_shuf4 0~1~ + PSHUFB_SUBPEL_H_4b m0, m4, m6, m1, m3, 0 ;H subpel_h_shuf4 2~3~ + PMADDUBSW m2, m7, m1, m3, 1 ;H subpel_filters + PMADDUBSW m0, m7, m1, m3, 0 ;H subpel_filters + PHADDW m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3 + PMULHRSW_8192 m2, m2, w8192reg +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m14, m2 + %else + mova [esp+mmsize*4], m2 + %endif +%endif + ; lower shuffle +%if cpuflag(ssse3) + mova m6, [base+subpel_h_shuf4] +%endif + movq m5, [srcq+strideq*0] ; 4 _ _ _ + movhps m5, [srcq+strideq*1] ; 4 _ 5 _ +%if ARCH_X86_32 + lea srcq, [srcq+strideq*2] + movq m4, [srcq+strideq*0] ; 6 _ _ _ + add srcq, strideq +%else + movq m4, [srcq+strideq*2] ; 6 _ _ _ + add srcq, stride3q +%endif + PSHUFB_SUBPEL_H_4a m3, m5, m6, m1, m2, 0 ;H subpel_h_shuf4 4~5~ + PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~ + PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters + PMADDUBSW m0, m7, m1, m2, 0 ;H subpel_filters + PHADDW m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7 + PMULHRSW_8192 m3, m3, w8192reg + SAVELINE_W4 m3, 3, 0 + ; upper shuffle +%if cpuflag(ssse3) + mova m6, [base+subpel_h_shuf4+16] +%endif + PSHUFB_SUBPEL_H_4b m3, m5, m6, m1, m2, 0 ;H subpel_h_shuf4 4~5~ + PSHUFB_SUBPEL_H_4b m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~ + PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters + PMADDUBSW m0, m7, m1, m2, 0 ;H subpel_filters + PHADDW m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7 + PMULHRSW_8192 m3, m3, w8192reg +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m2, m14 + %else + mova m2, [esp+mmsize*4] + %endif +%endif + ;process high + PALIGNR m4, m3, m2, 4;V 1 2 3 4 + punpcklwd m1, m2, m4 ; V 01 12 + punpckhwd m2, m4 ; V 23 34 + pshufd m0, m3, q2121;V 5 6 5 6 + punpcklwd m3, m0 ; V 45 56 + SAVELINE_W4 m0, 0, 1 + SAVELINE_W4 m1, 1, 1 + SAVELINE_W4 m2, 2, 1 + SAVELINE_W4 m3, 3, 1 + ;process low + RESTORELINE_W4 m2, 2, 0 + RESTORELINE_W4 m3, 3, 0 + PALIGNR m4, m3, m2, 4;V 1 2 3 4 + punpcklwd m1, m2, m4 ; V 01 12 + punpckhwd m2, m4 ; V 23 34 + pshufd m0, m3, q2121;V 5 6 5 6 + punpcklwd m3, m0 ; V 45 56 +.hv_w4_loop: + ;process low + pmaddwd m5, m1, subpelv0 ; V a0 b0 + mova m1, m2 + pmaddwd m2, subpelv1; V a1 b1 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, subpelv2; V a2 b2 + paddd m5, m3 +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m14, m5 + %else + mova 
[esp+mmsize*4], m5 + %define m15 m3 + %endif +%endif +%if cpuflag(ssse3) + mova m6, [base+subpel_h_shuf4] +%endif + movq m4, [srcq+strideq*0] ; 7 + movhps m4, [srcq+strideq*1] ; 7 _ 8 _ + PSHUFB_SUBPEL_H_4a m4, m4, m6, m3, m5, 0 ; H subpel_h_shuf4 7~8~ + PMADDUBSW m4, m7, m3, m5, 1 ; H subpel_filters + PHADDW m4, m4, m15, ARCH_X86_32 ; H 7878 + PMULHRSW_8192 m4, m4, w8192reg + PALIGNR m3, m4, m0, 12, m5 ; 6787 + mova m0, m4 + punpcklwd m3, m4 ; 67 78 + pmaddwd m4, m3, subpelv3; a3 b3 +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m5, m14 + %else + mova m5, [esp+mmsize*4] + %endif +%endif + paddd m5, d32reg ; pd_32 + paddd m5, m4 + psrad m5, 6 + SAVELINE_W4 m0, 0, 0 + SAVELINE_W4 m1, 1, 0 + SAVELINE_W4 m2, 2, 0 + SAVELINE_W4 m3, 3, 0 + SAVELINE_W4 m5, 5, 0 + ;process high + RESTORELINE_W4 m0, 0, 1 + RESTORELINE_W4 m1, 1, 1 + RESTORELINE_W4 m2, 2, 1 + RESTORELINE_W4 m3, 3, 1 + pmaddwd m5, m1, subpelv0; V a0 b0 + mova m1, m2 + pmaddwd m2, subpelv1; V a1 b1 + paddd m5, m2 + mova m2, m3 + pmaddwd m3, subpelv2; V a2 b2 + paddd m5, m3 +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m14, m5 + %else + mova [esp+0xA0], m5 + %endif +%endif +%if cpuflag(ssse3) + mova m6, [base+subpel_h_shuf4+16] +%endif + movq m4, [srcq+strideq*0] ; 7 + movhps m4, [srcq+strideq*1] ; 7 _ 8 _ + PSHUFB_SUBPEL_H_4b m4, m4, m6, m3, m5, 0 ; H subpel_h_shuf4 7~8~ + PMADDUBSW m4, m7, m3, m5, 1 ; H subpel_filters + PHADDW m4, m4, m15, ARCH_X86_32 ; H 7878 + PMULHRSW_8192 m4, m4, w8192reg + PALIGNR m3, m4, m0, 12, m5 ; 6787 + mova m0, m4 + punpcklwd m3, m4 ; 67 78 + pmaddwd m4, m3, subpelv3; a3 b3 +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m5, m14 + %else + mova m5, [esp+0xA0] + %endif +%endif + paddd m5, d32reg ; pd_32 + paddd m5, m4 + psrad m4, m5, 6 + RESTORELINE_W4 m5, 5, 0 + packssdw m5, m4 + pshufd m5, m5, q3120 + movu [tmpq], m5 + lea srcq, [srcq+strideq*2] + add tmpq, 16 + sub hd, 2 + SAVELINE_W4 m0, 0, 1 + SAVELINE_W4 m1, 1, 1 + SAVELINE_W4 m2, 2, 1 + SAVELINE_W4 m3, 3, 1 + RESTORELINE_W4 m0, 0, 0 + RESTORELINE_W4 m1, 1, 0 + RESTORELINE_W4 m2, 2, 0 + RESTORELINE_W4 m3, 3, 0 + jg .hv_w4_loop + RET +%undef subpelv0 +%undef subpelv1 +%undef subpelv2 +%undef subpelv3 +.hv_w8: + %assign stack_offset org_stack_offset +%define hv8_line_1 0 +%define hv8_line_2 1 +%define hv8_line_3 2 +%define hv8_line_4 3 +%define hv8_line_6 4 + shr mxd, 16 +%if ARCH_X86_32 + %define subpelh0 [rsp+mmsize*5] + %define subpelh1 [rsp+mmsize*6] + %define subpelv0 [rsp+mmsize*7] + %define subpelv1 [rsp+mmsize*8] + %define subpelv2 [rsp+mmsize*9] + %define subpelv3 [rsp+mmsize*10] + %define accuv0 [rsp+mmsize*11] + %define accuv1 [rsp+mmsize*12] + movq m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX] + mov mxd, myd + shr myd, 16 + and mxd, 0x7f + cmp hd, 6 + cmovs myd, mxd + movq m5, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] + mov strideq, stridem + %assign regs_used 6 + ALLOC_STACK -mmsize*14 + %assign regs_used 7 + %if STACK_ALIGNMENT < mmsize + %define tmpm [rsp+mmsize*13+gprsize*1] + %define srcm [rsp+mmsize*13+gprsize*2] + %define stridem [rsp+mmsize*13+gprsize*3] + mov tmpm, tmpq + mov stridem, strideq + %endif + %if cpuflag(ssse3) + pshufd m0, m1, q0000 + pshufd m1, m1, q1111 + %else + punpcklbw m1, m1 + psraw m1, 8 + pshufd m0, m1, q1010 + punpckhqdq m1, m1 + %endif + punpcklbw m5, m5 + psraw m5, 8 + pshufd m2, m5, q0000 + pshufd m3, m5, q1111 + pshufd m4, m5, q2222 + pshufd m5, m5, q3333 + mova subpelh0, m0 + mova subpelh1, m1 + mova subpelv0, m2 + mova subpelv1, m3 + mova subpelv2, m4 + mova subpelv3, m5 + lea r5, [strideq*3+3] + 
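; (ed. note) r5 holds strideq*3+3 at this point, so the sub below steps srcq back three rows and three columns; the 8-tap window spans taps -3..+4, which centres the filter on the first output pixel of the h/v pass. +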
sub srcq, r5 + mov srcm, srcq +%else + ALLOC_STACK mmsize*5, 16 + %define subpelh0 m10 + %define subpelh1 m11 + %define subpelv0 m12 + %define subpelv1 m13 + %define subpelv2 m14 + %define subpelv3 m15 + %define accuv0 m8 + %define accuv1 m9 + movq m0, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX] + movzx mxd, myb + shr myd, 16 + cmp hd, 6 + cmovs myd, mxd + movq m1, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] + %if cpuflag(ssse3) + pshufd subpelh0, m0, q0000 + pshufd subpelh1, m0, q1111 + %else + punpcklbw m0, m0 + psraw m0, 8 + pshufd subpelh0, m0, q1010 + pshufd subpelh1, m0, q3232 + mova m7, [base+pw_2] + %endif + punpcklbw m1, m1 + psraw m1, 8 + pshufd subpelv0, m1, q0000 + pshufd subpelv1, m1, q1111 + pshufd subpelv2, m1, q2222 + pshufd subpelv3, m1, q3333 + lea stride3q, [strideq*3] + sub srcq, 3 + sub srcq, stride3q + mov r6, srcq + mov r8, tmpq +%endif + lea r5d, [wq-4] + shl r5d, 14 + add r5d, hd +.hv_w8_loop0: +%if cpuflag(ssse3) + %if ARCH_X86_64 + mova m7, [base+subpel_h_shufA] + mova m8, [base+subpel_h_shufB] + mova m9, [base+subpel_h_shufC] + %define shufA m7 + %define shufB m8 + %define shufC m9 + %else + %define shufA [base+subpel_h_shufA] + %define shufB [base+subpel_h_shufB] + %define shufC [base+subpel_h_shufC] + %endif +%endif + PREP_8TAP_HV m4, srcq+strideq*0, m7, m0 + PREP_8TAP_HV m5, srcq+strideq*1, m7, m0 +%if ARCH_X86_64 + PREP_8TAP_HV m6, srcq+strideq*2, m7, m0 + add srcq, stride3q + PREP_8TAP_HV m0, srcq+strideq*0, m7, m9 +%else + lea srcq, [srcq+strideq*2] + %if notcpuflag(ssse3) + mova [esp], m4 + %endif + PREP_8TAP_HV m6, srcq+strideq*0, m7, m4 + PREP_8TAP_HV m0, srcq+strideq*1, m7, m4 + lea srcq, [srcq+strideq*2] +%endif +%if cpuflag(ssse3) + mova m7, [base+pw_8192] +%else + mova m7, [base+pw_2] + %if ARCH_X86_32 + mova m4, [esp] + %endif +%endif + PMULHRSW_8192 m4, m4, m7 + PMULHRSW_8192 m5, m5, m7 + PMULHRSW_8192 m6, m6, m7 + PMULHRSW_8192 m0, m0, m7 + punpcklwd m1, m4, m5 ; 01 + punpcklwd m2, m5, m6 ; 12 + punpcklwd m3, m6, m0 ; 23 + SAVELINE_W8 1, m1 + SAVELINE_W8 2, m2 + SAVELINE_W8 3, m3 +%if cpuflag(ssse3) + mova m7, [base+subpel_h_shufA] +%endif +%if ARCH_X86_64 + PREP_8TAP_HV m4, srcq+strideq*1, m8, m9 + PREP_8TAP_HV m5, srcq+strideq*2, m8, m9 + add srcq, stride3q + PREP_8TAP_HV m6, srcq+strideq*0, m8, m9 +%else + %if notcpuflag(ssse3) + mova [esp+0x30], m0 + %endif + PREP_8TAP_HV m4, srcq+strideq*0, m7, m0 + PREP_8TAP_HV m5, srcq+strideq*1, m7, m0 + lea srcq, [srcq+strideq*2] + PREP_8TAP_HV m6, srcq+strideq*0, m7, m0 +%endif +%if cpuflag(ssse3) + mova m7, [base+pw_8192] +%elif ARCH_X86_32 + mova m0, [esp+0x30] + mova m7, [base+pw_2] +%endif + PMULHRSW_8192 m1, m4, m7 + PMULHRSW_8192 m2, m5, m7 + PMULHRSW_8192 m3, m6, m7 + punpcklwd m4, m0, m1 ; 34 + punpcklwd m5, m1, m2 ; 45 + punpcklwd m6, m2, m3 ; 56 + SAVELINE_W8 6, m3 + RESTORELINE_W8 1, m1 + RESTORELINE_W8 2, m2 + RESTORELINE_W8 3, m3 +.hv_w8_loop: + SAVELINE_W8 1, m3 + SAVELINE_W8 2, m4 + SAVELINE_W8 3, m5 + SAVELINE_W8 4, m6 +%if ARCH_X86_32 + pmaddwd m0, m1, subpelv0 ; a0 + pmaddwd m7, m2, subpelv0 ; b0 + pmaddwd m3, subpelv1 ; a1 + pmaddwd m4, subpelv1 ; b1 + paddd m0, m3 + paddd m7, m4 + pmaddwd m5, subpelv2 ; a2 + pmaddwd m6, subpelv2 ; b2 + paddd m0, m5 + paddd m7, m6 + mova m5, [base+pd_32] + paddd m0, m5 + paddd m7, m5 + mova accuv0, m0 + mova accuv1, m7 +%else + pmaddwd accuv0, m1, subpelv0 ; a0 + pmaddwd accuv1, m2, subpelv0 ; b0 + pmaddwd m3, subpelv1 ; a1 + pmaddwd m4, subpelv1 ; b1 + paddd accuv0, m3 + paddd accuv1, m4 + pmaddwd m5, subpelv2 ; a2 + pmaddwd m6, subpelv2 ; b2 + 
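; (ed. note) the two paddd below fold the a2/b2 products into accuv0/accuv1, which then hold vertical taps 0-2 for output rows a and b plus the pd_32 rounding bias; the remaining tap pair (subpelv3) is applied once the next two source rows have been filtered horizontally, after which the sums are shifted right by 6. +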
paddd accuv0, m5 + paddd accuv1, m6 + mova m7, [base+pd_32] + paddd accuv0, m7 + paddd accuv1, m7 + %if cpuflag(ssse3) + mova m7, [base+subpel_h_shufB] + mova m6, [base+subpel_h_shufC] + mova m5, [base+subpel_h_shufA] + %define shufA m5 + %define shufB m7 + %define shufC m6 + %endif +%endif + PREP_8TAP_HV m0, srcq+strideq*1, m5, m6 + lea srcq, [srcq+strideq*2] + PREP_8TAP_HV m4, srcq+strideq*0, m5, m6 +%if cpuflag(ssse3) + mova m5, [base+pw_8192] +%else + mova m5, [base+pw_2] +%endif + PMULHRSW_8192 m0, m0, m5 + PMULHRSW_8192 m4, m4, m5 + RESTORELINE_W8 6, m6 + punpcklwd m5, m6, m0 ; 67 + punpcklwd m6, m0, m4 ; 78 + pmaddwd m1, m5, subpelv3 ; a3 + paddd m2, m1, accuv0 + pmaddwd m1, m6, subpelv3 ; b3 + paddd m1, m1, accuv1 + psrad m2, 6 + psrad m1, 6 + packssdw m2, m1 + movq [tmpq+wq*0], m2 + movhps [tmpq+wq*2], m2 + lea tmpq, [tmpq+wq*4] + sub hd, 2 + jle .hv_w8_outer + SAVELINE_W8 6, m4 + RESTORELINE_W8 1, m1 + RESTORELINE_W8 2, m2 + RESTORELINE_W8 3, m3 + RESTORELINE_W8 4, m4 + jmp .hv_w8_loop +.hv_w8_outer: +%if ARCH_X86_32 + mov srcq, srcm + mov tmpq, tmpm + movzx hd, r5w + add srcq, 4 + add tmpq, 8 + mov srcm, srcq + mov tmpm, tmpq +%else + add r6, 4 + add r8, 8 + movzx hd, r5b + mov srcq, r6 + mov tmpq, r8 +%endif + sub r5d, 1<<16 + jg .hv_w8_loop0 + RET +%endmacro + +%macro movifprep 2 + %if isprep + mov %1, %2 + %endif +%endmacro + +%macro SAVE_REG 1 + %xdefine r%1_save r%1 + %xdefine r%1q_save r%1q + %xdefine r%1d_save r%1d + %if ARCH_X86_32 + %define r%1m_save [rstk+stack_offset+(%1+1)*4] + %endif +%endmacro + +%macro LOAD_REG 1 + %xdefine r%1 r%1_save + %xdefine r%1q r%1q_save + %xdefine r%1d r%1d_save + %if ARCH_X86_32 + %define r%1m r%1m_save + %endif + %undef r%1d_save + %undef r%1q_save + %undef r%1_save +%endmacro + +%macro REMAP_REG 2-3 + %xdefine r%1 r%2 + %xdefine r%1q r%2q + %xdefine r%1d r%2d + %if ARCH_X86_32 + %if %3 == 0 + %xdefine r%1m r%2m + %else + %define r%1m [rstk+stack_offset+(%1+1)*4] + %endif + %endif +%endmacro + +%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 + %if isprep + %if ARCH_X86_64 + SAVE_REG 14 + %assign %%i 14 + %rep 14 + %assign %%j %%i-1 + REMAP_REG %%i, %%j + %assign %%i %%i-1 + %endrep + %else + SAVE_REG 5 + %assign %%i 5 + %rep 5 + %assign %%j %%i-1 + REMAP_REG %%i, %%j, 0 + %assign %%i %%i-1 + %endrep + %endif + %endif +%endmacro + +%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 + %if isprep + %assign %%i 1 + %if ARCH_X86_64 + %rep 13 + %assign %%j %%i+1 + REMAP_REG %%i, %%j + %assign %%i %%i+1 + %endrep + LOAD_REG 14 + %else + %rep 4 + %assign %%j %%i+1 + REMAP_REG %%i, %%j, 1 + %assign %%i %%i+1 + %endrep + LOAD_REG 5 + %endif + %endif +%endmacro + +%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged + MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT + RET + %if %1 + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %endif +%endmacro + +%if ARCH_X86_64 + %macro MC_8TAP_SCALED_H 12 ; dst[0-1], tmp[0-5], weights[0-3] + SWAP m%2, m%5 + movq m%1, [srcq+ r4] + movq m%2, [srcq+ r6] + movhps m%1, [srcq+ r7] + movhps m%2, [srcq+ r9] + movq m%3, [srcq+r10] + movq m%4, [srcq+r11] + movhps m%3, [srcq+r13] + movhps m%4, [srcq+ rX] + add srcq, ssq + movq m%5, [srcq+ r4] + movq m%6, [srcq+ r6] + movhps m%5, [srcq+ r7] + movhps m%6, [srcq+ r9] + movq m%7, [srcq+r10] + movq m%8, [srcq+r11] + movhps m%7, [srcq+r13] + movhps m%8, [srcq+ rX] + add srcq, ssq + pmaddubsw m%1, m%9 + pmaddubsw m%5, m%9 + pmaddubsw m%2, m%10 + pmaddubsw m%6, m%10 + pmaddubsw m%3, m%11 + pmaddubsw m%7, m%11 + pmaddubsw m%4, m%12 + pmaddubsw m%8, m%12 + phaddw m%1, m%2 + phaddw m%5, m%6 + phaddw m%3, 
m%4 + phaddw m%7, m%8 + phaddw m%1, m%3 + phaddw m%5, m%7 + pmulhrsw m%1, m12 + pmulhrsw m%5, m12 + SWAP m%2, m%5 + %endmacro +%else + %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem_start, load_fh_offsets + %if %3 == 1 + mov r0, [esp+ 0] + mov rX, [esp+ 8] + mov r4, [esp+ 4] + mov r5, [esp+12] + %endif + movq m0, [srcq+r0] + movq m1, [srcq+rX] + movhps m0, [srcq+r4] + movhps m1, [srcq+r5] + add srcq, ssq + movq m4, [srcq+r0] + movq m5, [srcq+rX] + movhps m4, [srcq+r4] + movhps m5, [srcq+r5] + mov r0, [esp+16] + mov rX, [esp+24] + mov r4, [esp+20] + mov r5, [esp+28] + sub srcq, ssq + movq m2, [srcq+r0] + movq m3, [srcq+rX] + movhps m2, [srcq+r4] + movhps m3, [srcq+r5] + add srcq, ssq + movq m6, [srcq+r0] + movq m7, [srcq+rX] + movhps m6, [srcq+r4] + movhps m7, [srcq+r5] + add srcq, ssq + pmaddubsw m0, [esp+%1+ 0] + pmaddubsw m4, [esp+%1+ 0] + pmaddubsw m1, [esp+%1+16] + pmaddubsw m5, [esp+%1+16] + pmaddubsw m2, [esp+%1+32] + pmaddubsw m6, [esp+%1+32] + pmaddubsw m3, [esp+%1+48] + pmaddubsw m7, [esp+%1+48] + phaddw m0, m1 + phaddw m4, m5 + phaddw m2, m3 + phaddw m6, m7 + phaddw m0, m2 + phaddw m4, m6 + pmulhrsw m0, m12 + pmulhrsw m4, m12 + %if %2 != 0 + mova [esp+%2+ 0], m0 + mova [esp+%2+16], m4 + %endif + %endmacro +%endif + +%macro MC_8TAP_SCALED 1 +%ifidn %1, put + %assign isprep 0 + %if ARCH_X86_64 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal put_8tap_scaled_8bpc, 2, 15, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy + %else +cglobal put_8tap_scaled_8bpc, 2, 14, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy + %endif + %else ; ARCH_X86_32 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal put_8tap_scaled_8bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy + %else +cglobal put_8tap_scaled_8bpc, 0, 7, 8, -0x200-0x20, dst, ds, src, ss, w, h, mx, my, dx, dy + %endif + %endif + %xdefine base_reg r12 + %define rndshift 10 +%else ; prep + %assign isprep 1 + %if ARCH_X86_64 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal prep_8tap_scaled_8bpc, 2, 15, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy + %xdefine tmp_stridem r14q + %else +cglobal prep_8tap_scaled_8bpc, 2, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy + %define tmp_stridem qword [rsp+0x138] + %endif + %xdefine base_reg r11 + %else ; ARCH_X86_32 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal prep_8tap_scaled_8bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy + %else +cglobal prep_8tap_scaled_8bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy + %endif + %define tmp_stridem dword [esp+0x138] + %endif + %define rndshift 6 +%endif +%if ARCH_X86_32 + mov [esp+0x1f0], t0d + mov [esp+0x1f4], t1d + %if !isprep && required_stack_alignment > STACK_ALIGNMENT + mov dstd, dstm + mov dsd, dsm + mov srcd, srcm + mov ssd, ssm + mov hd, hm + mov r4, mxm + %define r0m [esp+0x200] + %define dsm [esp+0x204] + %define dsmp dsm + %define r1m dsm + %define r2m [esp+0x208] + %define ssm [esp+0x20c] + %define r3m ssm + %define hm [esp+0x210] + %define mxm [esp+0x214] + mov r0m, dstd + mov dsm, dsd + mov r2m, srcd + mov ssm, ssd + mov hm, hd + mov r0, mym + mov r1, dxm + mov r2, dym + %define mym [esp+0x218] + %define dxm [esp+0x09c] + %define dym [esp+0x21c] + mov mxm, r4 + mov mym, r0 + mov dxm, r1 + mov dym, r2 + tzcnt wd, wm + %endif + %if isprep && required_stack_alignment > STACK_ALIGNMENT + %xdefine base_reg r5 + %else + %xdefine base_reg r6 + %endif + mov ssd, ssm +%endif + LEA base_reg, %1_8tap_scaled_8bpc_ssse3 +%xdefine base base_reg-%1_8tap_scaled_8bpc_ssse3 +%if 
ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT + tzcnt wd, wm +%endif +%if ARCH_X86_32 + %define m8 m0 + %define m9 m1 + %define m14 m4 + %define m15 m3 +%endif + movd m8, dxm + movd m14, mxm + pshufd m8, m8, q0000 + pshufd m14, m14, q0000 +%if isprep && UNIX64 + mov r5d, t0d + DECLARE_REG_TMP 5, 7 +%endif +%if ARCH_X86_64 + mov dyd, dym +%endif +%ifidn %1, put + %if WIN64 + mov r8d, hm + DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 + %define hm r5m + %define dxm r8m + %elif ARCH_X86_64 + DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 + %define hm r6m + %endif + %if ARCH_X86_64 + %if required_stack_alignment > STACK_ALIGNMENT + %define dsm [rsp+0x138] + %define rX r1 + %define rXd r1d + %else + %define dsm dsq + %define rX r14 + %define rXd r14d + %endif + %else + %define rX r1 + %endif +%else ; prep + %if WIN64 + mov r7d, hm + DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 + %define hm r4m + %define dxm r7m + %elif ARCH_X86_64 + DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 + %define hm [rsp+0x94] + %endif + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %if ARCH_X86_64 + %define rX r14 + %define rXd r14d + %else + %define rX r3 + %endif +%endif +%if ARCH_X86_64 + mova m10, [base+pd_0x3ff] + mova m12, [base+pw_8192] + %ifidn %1, put + mova m13, [base+pd_512] + %else + mova m13, [base+pd_32] + %endif +%else + %define m10 [base+pd_0x3ff] + %define m12 [base+pw_8192] + %ifidn %1, put + %define m13 [base+pd_512] + %else + %define m13 [base+pd_32] + %endif +%endif + pxor m9, m9 +%if ARCH_X86_64 + lea ss3q, [ssq*3] + movzx r7d, t1b + shr t1d, 16 + cmp hd, 6 + cmovs t1d, r7d + sub srcq, ss3q +%else + MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT + mov r1, [esp+0x1f4] + lea r0, [ssq*3] + movzx r2, r1b + shr r1, 16 + cmp dword hm, 6 + cmovs r1, r2 + mov [esp+0x1f4], r1 + mov r1, r1m + mov r2, r2m + sub srcq, r0 + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %define ss3q r0 + %define myd r4 + %define dyd dword dym + %define hd dword hm +%endif + cmp dyd, 1024 + je .dy1 + cmp dyd, 2048 + je .dy2 + movzx wd, word [base+%1_8tap_scaled_ssse3_table+wq*2] + add wq, base_reg + jmp wq +%ifidn %1, put +.w2: + %if ARCH_X86_64 + mov myd, mym + movzx t0d, t0b + dec srcq + movd m15, t0d + %else + movzx r4, byte [esp+0x1f0] + dec srcq + movd m15, r4 + %endif + punpckldq m9, m8 + SWAP m8, m9 + paddd m14, m8 ; mx+dx*[0-1] + %if ARCH_X86_64 + mova m11, [base+pd_0x4000] + %else + %define m11 [base+pd_0x4000] + %endif + pshufd m15, m15, q0000 + pand m8, m14, m10 + psrld m8, 6 + paddd m15, m8 + movd r4d, m15 + psrldq m15, 4 + %if ARCH_X86_64 + movd r6d, m15 + %else + movd r3d, m15 + %endif + mova m5, [base+bdct_lb_dw] + mova m6, [base+subpel_s_shuf2] + movd m15, [base+subpel_filters+r4*8+2] + %if ARCH_X86_64 + movd m7, [base+subpel_filters+r6*8+2] + %else + movd m7, [base+subpel_filters+r3*8+2] + %endif + pxor m9, m9 + pcmpeqd m8, m9 + psrld m14, 10 + %if ARCH_X86_32 + mov r3, r3m + pshufb m14, m5 + paddb m14, m6 + mova [rsp+0x180], m14 + SWAP m5, m0 + SWAP m6, m3 + %define m8 m5 + %define m15 m6 + %endif + movq m0, [srcq+ssq*0] + movq m2, [srcq+ssq*2] + movhps m0, [srcq+ssq*1] + movhps m2, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + %if ARCH_X86_64 + pshufb m14, m5 + paddb m14, m6 + %endif + movq m1, [srcq+ssq*0] + movq m3, [srcq+ssq*2] + movhps m1, [srcq+ssq*1] + movhps m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + punpckldq m15, m7 + punpcklqdq m15, m15 + %if ARCH_X86_64 + pand m11, m8 + pandn m8, m15 + SWAP m15, m8 + por m15, m11 + %else + pand m7, m8, m11 + pandn m8, m15 + %define m8 m6 + %define 
m15 m5 + por m15, m7 + mova [rsp+0x190], m15 + %endif + pshufb m0, m14 + pshufb m2, m14 + pshufb m1, m14 + pshufb m3, m14 + pmaddubsw m0, m15 + pmaddubsw m2, m15 + pmaddubsw m1, m15 + pmaddubsw m3, m15 + phaddw m0, m2 + phaddw m1, m3 + pmulhrsw m0, m12 ; 0 1 2 3 + pmulhrsw m1, m12 ; 4 5 6 7 + palignr m2, m1, m0, 4 ; 1 2 3 4 + punpcklwd m3, m0, m2 ; 01 12 + punpckhwd m0, m2 ; 23 34 + pshufd m5, m1, q0321 ; 5 6 7 _ + punpcklwd m2, m1, m5 ; 45 56 + punpckhwd m4, m1, m5 ; 67 __ + %if ARCH_X86_32 + mov myd, mym + mov r0, r0m + mova [rsp+0x1a0], m3 + mova [rsp+0x1b0], m0 + mova [rsp+0x1c0], m2 + mova [rsp+0x1d0], m4 + %endif +.w2_loop: + and myd, 0x3ff + %if ARCH_X86_64 + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq m11, r6q + punpcklbw m11, m11 + psraw m11, 8 + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pshufd m10, m11, q2222 + pshufd m11, m11, q3333 + pmaddwd m5, m3, m8 + pmaddwd m6, m0, m9 + pmaddwd m7, m2, m10 + pmaddwd m8, m4, m11 + paddd m5, m6 + paddd m7, m8 + %else + mov mym, myd + mov r1, [esp+0x1f4] + xor r3, r3 + shr r4, 6 + lea r1, [r1+r4] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r1*8+0] + cmovnz r3, [base+subpel_filters+r1*8+4] + movd m7, r4 + movd m6, r3 + punpckldq m7, m6 + punpcklbw m7, m7 + psraw m7, 8 + pshufd m5, m7, q0000 + pshufd m6, m7, q1111 + pmaddwd m3, m5 + pmaddwd m0, m6 + pshufd m5, m7, q2222 + pshufd m7, m7, q3333 + pmaddwd m2, m5 + pmaddwd m4, m7 + paddd m3, m0 + paddd m2, m4 + SWAP m5, m3 + SWAP m7, m2 + %endif + paddd m5, m13 + paddd m5, m7 + psrad m5, 10 + packssdw m5, m5 + packuswb m5, m5 + %if ARCH_X86_64 + pextrw r6d, m5, 0 + mov [dstq], r6w + add dstq, dsq + dec hd + jz .ret + add myd, dyd + %else + pextrw r3d, m5, 0 + mov [dstq], r3w + add dstq, dsm + dec hd + jz .ret + mov myd, mym + add myd, dym + %endif + test myd, ~0x3ff + %if ARCH_X86_32 + SWAP m3, m5 + SWAP m2, m7 + mova m3, [rsp+0x1a0] + mova m0, [rsp+0x1b0] + mova m2, [rsp+0x1c0] + mova m4, [rsp+0x1d0] + %define m14 [esp+0x180] + %define m15 [esp+0x190] + %endif + jz .w2_loop + %if ARCH_X86_32 + mov r3, r3m + %endif + movq m5, [srcq] + test myd, 0x400 + jz .w2_skip_line + add srcq, ssq + shufps m3, m0, q1032 ; 01 12 + shufps m0, m2, q1032 ; 23 34 + shufps m2, m4, q1032 ; 45 56 + pshufb m5, m14 + pmaddubsw m5, m15 + phaddw m5, m5 + pmulhrsw m5, m12 + palignr m4, m5, m1, 12 + punpcklqdq m1, m4, m4 ; 6 7 6 7 + punpcklwd m4, m1, m5 ; 67 __ + %if ARCH_X86_32 + mova [rsp+0x1a0], m3 + mova [rsp+0x1b0], m0 + mova [rsp+0x1c0], m2 + mova [rsp+0x1d0], m4 + %endif + jmp .w2_loop +.w2_skip_line: + movhps m5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova m3, m0 ; 01 12 + mova m0, m2 ; 23 34 + pshufb m5, m14 + pmaddubsw m5, m15 + phaddw m5, m5 + pmulhrsw m5, m12 ; 6 7 6 7 + palignr m4, m5, m1, 8 ; 4 5 6 7 + pshufd m5, m4, q0321 ; 5 6 7 _ + mova m1, m4 + punpcklwd m2, m4, m5 ; 45 56 + punpckhwd m4, m5 ; 67 __ + %if ARCH_X86_32 + mova [rsp+0x1a0], m3 + mova [rsp+0x1b0], m0 + mova [rsp+0x1c0], m2 + mova [rsp+0x1d0], m4 + %endif + jmp .w2_loop +%endif +INIT_XMM ssse3 +.w4: +%if ARCH_X86_64 + mov myd, mym + movzx t0d, t0b + dec srcq + movd m15, t0d +%else + %define m8 m0 + %xdefine m14 m4 + %define m15 m3 + movzx r4, byte [esp+0x1f0] + dec srcq + movd m15, r4 +%endif + pmaddwd m8, [base+rescale_mul] +%if ARCH_X86_64 + mova m11, [base+pd_0x4000] +%else + %define m11 [base+pd_0x4000] +%endif + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] + pand m0, m14, m10 + psrld m0, 6 + paddd m15, m0 + psrldq m7, m15, 8 +%if ARCH_X86_64 + movd 
r4d, m15 + movd r11d, m7 + psrldq m15, 4 + psrldq m7, 4 + movd r6d, m15 + movd r13d, m7 + movd m15, [base+subpel_filters+ r4*8+2] + movd m2, [base+subpel_filters+r11*8+2] + movd m3, [base+subpel_filters+ r6*8+2] + movd m4, [base+subpel_filters+r13*8+2] +%else + movd r0, m15 + movd rX, m7 + psrldq m15, 4 + psrldq m7, 4 + movd r4, m15 + movd r5, m7 + movd m1, [base+subpel_filters+r0*8+2] + movd m2, [base+subpel_filters+rX*8+2] + movd m3, [base+subpel_filters+r4*8+2] + movd m7, [base+subpel_filters+r5*8+2] + movifprep r3, r3m + SWAP m4, m7 + %define m15 m1 +%endif + mova m5, [base+bdct_lb_dw] + movq m6, [base+subpel_s_shuf2] + psrld m14, 10 + punpckldq m15, m3 + punpckldq m2, m4 + punpcklqdq m15, m2 + punpcklqdq m6, m6 + pshufb m14, m5 + paddb m14, m6 +%if ARCH_X86_64 + pcmpeqd m0, m9 + pand m11, m0 +%else + mova [esp+0x180], m14 + SWAP m7, m4 + pxor m3, m3 + pcmpeqd m0, m3 + pand m2, m11, m0 + %define m11 m2 +%endif + pandn m0, m15 +%if ARCH_X86_64 + SWAP m15, m0 +%else + %define m15 m0 +%endif + por m15, m11 +%if ARCH_X86_64 + movu m7, [srcq+ssq*0] + movu m9, [srcq+ssq*1] + movu m8, [srcq+ssq*2] + movu m10, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + movu m2, [srcq+ssq*0] + movu m4, [srcq+ssq*1] + movu m3, [srcq+ssq*2] + movu m5, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pshufb m7, m14 + pshufb m9, m14 + pshufb m8, m14 + pshufb m10, m14 + pshufb m2, m14 + pshufb m4, m14 + pshufb m3, m14 + pshufb m5, m14 + pmaddubsw m7, m15 + pmaddubsw m9, m15 + pmaddubsw m8, m15 + pmaddubsw m10, m15 + pmaddubsw m2, m15 + pmaddubsw m4, m15 + pmaddubsw m3, m15 + pmaddubsw m5, m15 + phaddw m7, m9 + phaddw m8, m10 + phaddw m9, m2, m4 + phaddw m3, m5 + pmulhrsw m7, m12 ; 0 1 + pmulhrsw m8, m12 ; 2 3 + pmulhrsw m9, m12 ; 4 5 + pmulhrsw m3, m12 ; 6 7 + shufps m4, m7, m8, q1032 ; 1 2 + shufps m5, m8, m9, q1032 ; 3 4 + shufps m6, m9, m3, q1032 ; 5 6 + psrldq m11, m3, 8 ; 7 _ + punpcklwd m0, m7, m4 ; 01 + punpckhwd m7, m4 ; 12 + punpcklwd m1, m8, m5 ; 23 + punpckhwd m8, m5 ; 34 + punpcklwd m2, m9, m6 ; 45 + punpckhwd m9, m6 ; 56 + punpcklwd m3, m11 ; 67 + mova [rsp+0x00], m7 + mova [rsp+0x10], m8 + mova [rsp+0x20], m9 +%else + mova [esp+0x190], m15 + lea ss3q, [ssq*3] + movu m2, [srcq+ssq*0] + movu m3, [srcq+ssq*1] + movu m7, [srcq+ssq*2] + movu m6, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pshufb m2, m14 + pshufb m3, m14 + pshufb m7, m14 + pshufb m6, m14 + pmaddubsw m2, m15 + pmaddubsw m3, m15 + pmaddubsw m7, m15 + pmaddubsw m6, m15 + phaddw m2, m3 + phaddw m7, m6 + movu m1, [srcq+ssq*0] + movu m5, [srcq+ssq*1] + movu m3, [srcq+ssq*2] + movu m6, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pshufb m1, m14 + pshufb m5, m14 + pshufb m3, m14 + pshufb m6, m14 + pmaddubsw m1, m15 + pmaddubsw m5, m15 + pmaddubsw m3, m15 + pmaddubsw m6, m15 + phaddw m1, m5 + phaddw m3, m6 + pmulhrsw m2, m12 + pmulhrsw m7, m12 + pmulhrsw m1, m12 + pmulhrsw m3, m12 + shufps m4, m2, m7, q1032 ; 1 2 + shufps m5, m7, m1, q1032 ; 3 4 + shufps m6, m1, m3, q1032 ; 5 6 + psrldq m0, m3, 8 ; 7 _ + mova [esp+0x1a0], m0 + %define m11 [esp+0x1a0] + punpcklwd m0, m2, m4 ; 01 + punpckhwd m2, m4 ; 12 + punpcklwd m4, m7, m5 ; 23 + punpckhwd m7, m5 ; 34 + punpcklwd m5, m1, m6 ; 45 + punpckhwd m1, m6 ; 56 + punpcklwd m3, [esp+0x1a0] ; 67 + mov myd, mym + mov r0, r0m + mova [esp+0x1b0], m0 ; 01 + mova [esp+0x1c0], m4 ; 23 + mova [esp+0x1d0], m5 ; 45 + mova [esp+0x1e0], m3 ; 67 + mova [rsp+0x00], m2 ; 12 + mova [rsp+0x10], m7 ; 34 + mova [rsp+0x20], m1 ; 56 + SWAP m1, m4 + SWAP m2, m5 +%endif +.w4_loop: + and myd, 0x3ff +%if ARCH_X86_64 + mov r6d, 64 << 24 + mov r4d, myd + 
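; (ed. note) the vertical filter is re-derived for every output row: bits 6-9 of the 10-bit subpel position index subpel_filters (offset by the filter type in t1), while a zero fraction leaves r6q = 64<<24, an identity filter whose only nonzero tap is 64 at the centre position. +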
shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq m10, r6q + punpcklbw m10, m10 + psraw m10, 8 + pshufd m7, m10, q0000 + pshufd m8, m10, q1111 + pshufd m9, m10, q2222 + pshufd m10, m10, q3333 + pmaddwd m4, m0, m7 + pmaddwd m5, m1, m8 + pmaddwd m6, m2, m9 + pmaddwd m7, m3, m10 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 +%else + mov mym, myd + mov r5, [esp+0x1f4] + xor r3, r3 + shr r4, 6 + lea r5, [r5+r4] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r5*8+0] + cmovnz r3, [base+subpel_filters+r5*8+4] + movd m7, r4 + movd m6, r3 + punpckldq m7, m6 + punpcklbw m7, m7 + psraw m7, 8 + pshufd m4, m7, q0000 + pshufd m5, m7, q1111 + pshufd m6, m7, q2222 + pshufd m7, m7, q3333 + pmaddwd m0, m4 + pmaddwd m1, m5 + pmaddwd m2, m6 + pmaddwd m3, m7 + paddd m0, m1 + paddd m2, m3 + paddd m0, m13 + paddd m0, m2 + SWAP m4, m0 +%endif + psrad m4, rndshift + packssdw m4, m4 +%ifidn %1, put + packuswb m4, m4 + movd [dstq], m4 + add dstq, dsmp +%else + movq [tmpq], m4 + add tmpq, 8 +%endif + dec hd + jz .ret +%if ARCH_X86_64 + add myd, dyd + test myd, ~0x3ff + jz .w4_loop +%else + SWAP m0, m4 + mov myd, mym + mov r3, r3m + add myd, dym + test myd, ~0x3ff + jnz .w4_next_line + mova m0, [esp+0x1b0] + mova m1, [esp+0x1c0] + mova m2, [esp+0x1d0] + mova m3, [esp+0x1e0] + jmp .w4_loop +.w4_next_line: + %define m14 [esp+0x180] + %define m15 [esp+0x190] +%endif + movu m4, [srcq] + test myd, 0x400 + jz .w4_skip_line +%if ARCH_X86_64 + mova m0, [rsp+0x00] + mova [rsp+0x00], m1 + mova m1, [rsp+0x10] + mova [rsp+0x10], m2 + mova m2, [rsp+0x20] + mova [rsp+0x20], m3 +%else + mova m5, [esp+0x1c0] + mova m0, [rsp+0x000] + mova [rsp+0x00], m5 + mova [esp+0x1b0], m0 + mova m6, [esp+0x1d0] + mova m1, [rsp+0x010] + mova [rsp+0x10], m6 + mova [esp+0x1c0], m1 + mova m7, [esp+0x1e0] + mova m2, [rsp+0x020] + mova [rsp+0x20], m7 + mova [esp+0x1d0], m2 +%endif + pshufb m4, m14 + pmaddubsw m4, m15 + phaddw m4, m4 + pmulhrsw m4, m12 + punpcklwd m3, m11, m4 +%if ARCH_X86_32 + mova [esp+0x1e0], m3 +%endif + mova m11, m4 + add srcq, ssq + jmp .w4_loop +.w4_skip_line: +%if ARCH_X86_32 + mova m0, [esp+0x1c0] + mova m1, [esp+0x1d0] + mova m2, [esp+0x1e0] +%endif + movu m5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova m6, [rsp+0x10] + mova m7, [rsp+0x20] + pshufb m4, m14 + pshufb m5, m14 + pmaddubsw m4, m15 + pmaddubsw m5, m15 + phaddw m4, m5 + pmulhrsw m4, m12 + punpcklwd m5, m11, m4 + mova [rsp+0x00], m6 + mova [rsp+0x10], m7 + mova [rsp+0x20], m5 +%if ARCH_X86_64 + psrldq m11, m4, 8 + mova m0, m1 + mova m1, m2 + mova m2, m3 + punpcklwd m3, m4, m11 +%else + psrldq m6, m4, 8 + punpcklwd m3, m4, m6 + mova [esp+0x1a0], m6 + mova [esp+0x1b0], m0 + mova [esp+0x1c0], m1 + mova [esp+0x1d0], m2 + mova [esp+0x1e0], m3 +%endif + jmp .w4_loop +INIT_XMM ssse3 +.w8: + mov dword [rsp+0x90], 1 + movifprep tmp_stridem, 16 + jmp .w_start +.w16: + mov dword [rsp+0x90], 2 + movifprep tmp_stridem, 32 + jmp .w_start +.w32: + mov dword [rsp+0x90], 4 + movifprep tmp_stridem, 64 + jmp .w_start +.w64: + mov dword [rsp+0x90], 8 + movifprep tmp_stridem, 128 + jmp .w_start +.w128: + mov dword [rsp+0x90], 16 + movifprep tmp_stridem, 256 +.w_start: +%ifidn %1, put + movifnidn dsm, dsq +%endif +%if ARCH_X86_64 + shr t0d, 16 + movd m15, t0d +%else + %define m8 m0 + %xdefine m14 m4 + %define m15 m3 + %if isprep + %define ssq ssm + %endif + mov r4, [esp+0x1f0] + shr r4, 16 + movd m15, r4 + mov r0, r0m + mov myd, mym +%endif + sub srcq, 3 + pslld m7, m8, 2 ; dx*4 + pmaddwd m8, [base+rescale_mul] ; dx*[0-3] + pshufd m15, m15, 
q0000 + paddd m14, m8 ; mx+dx*[0-3] + mova [rsp+0x100], m7 + mova [rsp+0x120], m15 + mov [rsp+0x098], srcq + mov [rsp+0x130], r0q ; dstq / tmpq +%if ARCH_X86_64 && UNIX64 + mov hm, hd +%elif ARCH_X86_32 + mov r5, hm + mov [esp+0x094], myd + mov [esp+0x134], r5 +%endif + jmp .hloop +.hloop_prep: + dec dword [rsp+0x090] + jz .ret +%if ARCH_X86_64 + add qword [rsp+0x130], 8*(isprep+1) + mov hd, hm +%else + add dword [esp+0x130], 8*(isprep+1) + mov myd, [esp+0x094] + mov r5, [esp+0x134] + mov r0, [esp+0x130] +%endif + mova m7, [rsp+0x100] + mova m14, [rsp+0x110] +%if ARCH_X86_64 + mova m10, [base+pd_0x3ff] +%endif + mova m15, [rsp+0x120] + pxor m9, m9 + mov srcq, [rsp+0x098] +%if ARCH_X86_64 + mov r0q, [rsp+0x130] ; dstq / tmpq +%else + mov mym, myd + mov hm, r5 + mov r0m, r0 + mov r3, r3m +%endif + paddd m14, m7 +.hloop: +%if ARCH_X86_64 + mova m11, [base+pq_0x40000000] +%else + %define m11 [base+pq_0x40000000] +%endif + psrld m2, m14, 10 + mova [rsp], m2 + pand m6, m14, m10 + psrld m6, 6 + paddd m5, m15, m6 + pcmpeqd m6, m9 + psrldq m2, m5, 8 +%if ARCH_X86_64 + movd r4d, m5 + movd r6d, m2 + psrldq m5, 4 + psrldq m2, 4 + movd r7d, m5 + movd r9d, m2 + movq m0, [base+subpel_filters+r4*8] + movq m1, [base+subpel_filters+r6*8] + movhps m0, [base+subpel_filters+r7*8] + movhps m1, [base+subpel_filters+r9*8] +%else + movd r0, m5 + movd rX, m2 + psrldq m5, 4 + psrldq m2, 4 + movd r4, m5 + movd r5, m2 + movq m0, [base+subpel_filters+r0*8] + movq m1, [base+subpel_filters+rX*8] + movhps m0, [base+subpel_filters+r4*8] + movhps m1, [base+subpel_filters+r5*8] + pxor m2, m2 + %define m9 m2 +%endif + paddd m14, m7 ; mx+dx*[4-7] + pand m5, m14, m10 + psrld m5, 6 + paddd m15, m5 + pcmpeqd m5, m9 + mova [rsp+0x110], m14 + psrldq m4, m15, 8 +%if ARCH_X86_64 + movd r10d, m15 + movd r11d, m4 + psrldq m15, 4 + psrldq m4, 4 + movd r13d, m15 + movd rXd, m4 + movq m2, [base+subpel_filters+r10*8] + movq m3, [base+subpel_filters+r11*8] + movhps m2, [base+subpel_filters+r13*8] + movhps m3, [base+subpel_filters+ rX*8] + psrld m14, 10 + psrldq m4, m14, 8 + movd r10d, m14 + movd r11d, m4 + psrldq m14, 4 + psrldq m4, 4 + movd r13d, m14 + movd rXd, m4 + mov r4d, [rsp+ 0] + mov r6d, [rsp+ 8] + mov r7d, [rsp+ 4] + mov r9d, [rsp+12] + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m14, m5, q1100 + pshufd m5, m5, q3322 + pand m7, m11, m4 + pand m8, m11, m6 + pand m15, m11, m14 + pand m11, m11, m5 + pandn m4, m0 + pandn m6, m1 + pandn m14, m2 + pandn m5, m3 + por m7, m4 + por m8, m6 + por m15, m14 + por m11, m5 + mova [rsp+0x10], m7 + mova [rsp+0x20], m8 + mova [rsp+0x30], m15 + mova [rsp+0x40], m11 + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10, 7, 8, 15, 11 ; 0-1 + mova [rsp+0x50], m1 + mova [rsp+0x60], m2 + MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10, 7, 8, 15, 11 ; 2-3 + mova [rsp+0x70], m3 + mova [rsp+0x80], m4 + MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 4-5 + MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 6-7 + SWAP m7, m0 + SWAP m8, m14 + mova m1, [rsp+0x50] + mova m2, [rsp+0x60] + mova m3, [rsp+0x70] + mova m9, [rsp+0x80] + mov myd, mym + mov dyd, dym + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m8 ; 67a + punpckhwd m7, m8 ; 67b + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m9 ; 23a + punpckhwd m3, m9 ; 23b + mova [rsp+0x50], m4 + mova [rsp+0x60], m5 + mova [rsp+0x70], m6 + mova [rsp+0x80], m7 + SWAP m14, m8 +.vloop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + 
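; (ed. note) punpcklbw x,x followed by psraw x,8 below is the usual SSE idiom for sign-extending the eight int8 filter taps to int16; the four pshufd broadcasts then split them into (tap0,tap1)..(tap6,tap7) pairs so each pmaddwd applies one tap pair to a pair of interleaved rows. +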
movq m11, r6q + punpcklbw m11, m11 + psraw m11, 8 + pshufd m5, m11, q0000 + pshufd m7, m11, q1111 + pshufd m10, m11, q2222 + pshufd m11, m11, q3333 + pmaddwd m4, m5, m0 + pmaddwd m5, m5, m1 + pmaddwd m6, m7, m2 + pmaddwd m7, m7, m3 + paddd m4, m13 + paddd m5, m13 + paddd m4, m6 + paddd m5, m7 + pmaddwd m6, [rsp+0x50], m10 + pmaddwd m7, [rsp+0x60], m10 + pmaddwd m8, [rsp+0x70], m11 + pmaddwd m9, [rsp+0x80], m11 + paddd m4, m6 + paddd m5, m7 + paddd m4, m8 + paddd m5, m9 +%else + movd r0, m15 + movd rX, m4 + psrldq m15, 4 + psrldq m4, 4 + movd r4, m15 + movd r5, m4 + mova m14, [esp+0x110] + movq m2, [base+subpel_filters+r0*8] + movq m3, [base+subpel_filters+rX*8] + movhps m2, [base+subpel_filters+r4*8] + movhps m3, [base+subpel_filters+r5*8] + psrld m14, 10 + mova [esp+16], m14 + mov r0, [esp+ 0] + mov rX, [esp+ 8] + mov r4, [esp+ 4] + mov r5, [esp+12] + mova [esp+0x20], m0 + mova [esp+0x30], m1 + mova [esp+0x40], m2 + mova [esp+0x50], m3 + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m7, m5, q1100 + pshufd m5, m5, q3322 + pand m0, m11, m4 + pand m1, m11, m6 + pand m2, m11, m7 + pand m3, m11, m5 + pandn m4, [esp+0x20] + pandn m6, [esp+0x30] + pandn m7, [esp+0x40] + pandn m5, [esp+0x50] + por m0, m4 + por m1, m6 + por m2, m7 + por m3, m5 + mova [esp+0x20], m0 + mova [esp+0x30], m1 + mova [esp+0x40], m2 + mova [esp+0x50], m3 + MC_8TAP_SCALED_H 0x20, 0x140, 0 ; 0-1 + MC_8TAP_SCALED_H 0x20, 0x160 ; 2-3 + MC_8TAP_SCALED_H 0x20, 0x180 ; 4-5 + MC_8TAP_SCALED_H 0x20, 0x1a0 ; 6-7 + mova m5, [esp+0x180] + mova m6, [esp+0x190] + mova m7, [esp+0x1a0] + mova m0, [esp+0x1b0] + mov myd, mym + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m0 ; 67a + punpckhwd m7, m0 ; 67b + mova [esp+0x180], m4 + mova [esp+0x190], m5 + mova [esp+0x1a0], m6 + mova [esp+0x1b0], m7 + mova m1, [esp+0x140] + mova m2, [esp+0x150] + mova m3, [esp+0x160] + mova m4, [esp+0x170] + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m4 ; 23a + punpckhwd m3, m4 ; 23b + mova [esp+0x140], m0 + mova [esp+0x150], m1 + mova [esp+0x160], m2 + mova [esp+0x170], m3 +.vloop: + mov r0, r0m + mov r5, [esp+0x1f4] + and myd, 0x3ff + mov mym, myd + xor r3, r3 + shr r4, 6 + lea r5, [r5+r4] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r5*8+0] + cmovnz r3, [base+subpel_filters+r5*8+4] + movd m7, r4 + movd m6, r3 + punpckldq m7, m6 + punpcklbw m7, m7 + psraw m7, 8 + pshufd m4, m7, q0000 + pshufd m5, m7, q1111 + pmaddwd m0, m4 + pmaddwd m1, m4 + pmaddwd m2, m5 + pmaddwd m3, m5 + pshufd m6, m7, q2222 + pshufd m7, m7, q3333 + paddd m0, m2 + paddd m1, m3 + pmaddwd m2, [esp+0x180], m6 + pmaddwd m3, [esp+0x190], m6 + pmaddwd m4, [esp+0x1a0], m7 + pmaddwd m5, [esp+0x1b0], m7 + paddd m0, m2 + paddd m1, m3 + paddd m0, m13 + paddd m1, m13 + paddd m4, m0 + paddd m5, m1 +%endif + psrad m4, rndshift + psrad m5, rndshift + packssdw m4, m5 +%ifidn %1, put + packuswb m4, m4 + movq [dstq], m4 + add dstq, dsm +%else + mova [tmpq], m4 + add tmpq, tmp_stridem +%endif + dec hd + jz .hloop_prep +%if ARCH_X86_64 + add myd, dyd + test myd, ~0x3ff + jz .vloop + test myd, 0x400 + mov [rsp+0x140], myd + mov r4d, [rsp+ 0] + mov r6d, [rsp+ 8] + mov r7d, [rsp+ 4] + mov r9d, [rsp+12] + jz .skip_line + mova m14, [base+unpckw] + movq m6, [srcq+r10] + movq m7, [srcq+r11] + movhps m6, [srcq+r13] + movhps m7, [srcq+ rX] + movq m4, [srcq+ r4] + movq m5, [srcq+ r6] + movhps m4, [srcq+ r7] + movhps m5, [srcq+ r9] + add srcq, ssq + mov myd, [rsp+0x140] + mov dyd, dym + pshufd m9, m14, q1032 + pshufb m0, m14 ; 0a 1a + pshufb m1, m14 
; 0b 1b + pshufb m2, m9 ; 3a 2a + pshufb m3, m9 ; 3b 2b + pmaddubsw m6, [rsp+0x30] + pmaddubsw m7, [rsp+0x40] + pmaddubsw m4, [rsp+0x10] + pmaddubsw m5, [rsp+0x20] + phaddw m6, m7 + phaddw m4, m5 + phaddw m4, m6 + pmulhrsw m4, m12 + pshufb m5, [rsp+0x50], m14 ; 4a 5a + pshufb m6, [rsp+0x60], m14 ; 4b 5b + pshufb m7, [rsp+0x70], m9 ; 7a 6a + pshufb m8, [rsp+0x80], m9 ; 7b 6b + punpckhwd m0, m2 ; 12a + punpckhwd m1, m3 ; 12b + punpcklwd m2, m5 ; 34a + punpcklwd m3, m6 ; 34b + punpckhwd m5, m7 ; 56a + punpckhwd m6, m8 ; 56b + punpcklwd m7, m4 ; 78a + punpckhqdq m4, m4 + punpcklwd m8, m4 ; 78b + mova [rsp+0x50], m5 + mova [rsp+0x60], m6 + mova [rsp+0x70], m7 + mova [rsp+0x80], m8 + jmp .vloop +.skip_line: + mova m0, [rsp+0x10] + mova m1, [rsp+0x20] + mova m14, [rsp+0x30] + mova m15, [rsp+0x40] + MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11, 0, 1, 14, 15 + mov myd, [rsp+0x140] + mov dyd, dym + mova m0, m2 ; 01a + mova m1, m3 ; 01b + mova m2, [rsp+0x50] ; 23a + mova m3, [rsp+0x60] ; 23b + mova m5, [rsp+0x70] ; 45a + mova m6, [rsp+0x80] ; 45b + punpcklwd m7, m4, m8 ; 67a + punpckhwd m4, m8 ; 67b + mova [rsp+0x50], m5 + mova [rsp+0x60], m6 + mova [rsp+0x70], m7 + mova [rsp+0x80], m4 +%else + mov r0m, r0 + mov myd, mym + mov r3, r3m + add myd, dym + test myd, ~0x3ff + mov mym, myd + jnz .next_line + mova m0, [esp+0x140] + mova m1, [esp+0x150] + mova m2, [esp+0x160] + mova m3, [esp+0x170] + jmp .vloop +.next_line: + test myd, 0x400 + mov r0, [esp+ 0] + mov rX, [esp+ 8] + mov r4, [esp+ 4] + mov r5, [esp+12] + jz .skip_line + mova m6, [base+unpckw] + mova m0, [esp+0x140] + mova m1, [esp+0x150] + mova m7, [esp+0x180] + movq m4, [srcq+r0] + movq m5, [srcq+rX] + movhps m4, [srcq+r4] + movhps m5, [srcq+r5] + pshufb m0, m6 ; 0a 1a + pshufb m1, m6 ; 0b 1b + pshufb m7, m6 ; 4a 5a + mov r0, [esp+16] + mov rX, [esp+24] + mov r4, [esp+20] + mov r5, [esp+28] + movq m3, [srcq+r0] + movq m2, [srcq+rX] + movhps m3, [srcq+r4] + movhps m2, [srcq+r5] + add srcq, ssq + pmaddubsw m4, [esp+0x20] + pmaddubsw m5, [esp+0x30] + pmaddubsw m3, [esp+0x40] + pmaddubsw m2, [esp+0x50] + phaddw m4, m5 + phaddw m3, m2 + mova m5, [esp+0x190] + mova m2, [esp+0x160] + phaddw m4, m3 + mova m3, [esp+0x170] + pmulhrsw m4, m12 ; 8a 8b + mov myd, mym + pshufb m5, m6 ; 4b 5b + pshufd m6, m6, q1032 + pshufb m2, m6 ; 3a 2a + pshufb m3, m6 ; 3b 2b + punpckhwd m0, m2 ; 12a + punpckhwd m1, m3 ; 12b + mova [esp+0x140], m0 + mova [esp+0x150], m1 + mova m0, [esp+0x1a0] + mova m1, [esp+0x1b0] + punpcklwd m2, m7 ; 34a + punpcklwd m3, m5 ; 34b + mova [esp+0x160], m2 + mova [esp+0x170], m3 + pshufb m0, m6 ; 7a 6a + pshufb m1, m6 ; 7b 6b + punpckhwd m7, m0 ; 56a + punpckhwd m5, m1 ; 56b + punpcklwd m0, m4 + punpckhqdq m4, m4 + punpcklwd m1, m4 + mova [esp+0x180], m7 + mova [esp+0x190], m5 + mova [esp+0x1a0], m0 + mova [esp+0x1b0], m1 + mova m0, [esp+0x140] + mova m1, [esp+0x150] + jmp .vloop +.skip_line: + MC_8TAP_SCALED_H 0x20, 0x1c0, 0 + mov myd, mym + mova m0, [esp+0x160] + mova m1, [esp+0x170] + mova m2, [esp+0x180] + mova m3, [esp+0x190] + mova [esp+0x140], m0 + mova [esp+0x150], m1 + mova m4, [esp+0x1a0] + mova m5, [esp+0x1b0] + mova [esp+0x160], m2 + mova [esp+0x170], m3 + mova m6, [esp+0x1c0] + mova m7, [esp+0x1d0] + mova [esp+0x180], m4 + mova [esp+0x190], m5 + punpcklwd m4, m6, m7 + punpckhwd m6, m7 + mova [esp+0x1a0], m4 + mova [esp+0x1b0], m6 +%endif + jmp .vloop +INIT_XMM ssse3 +.dy1: + movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2] + add wq, base_reg + jmp wq +%ifidn %1, put +.dy1_w2: + %if ARCH_X86_64 + mov myd, mym + movzx t0d, 
t0b + dec srcq + movd m15, t0d + %else + %define m8 m0 + %define m9 m1 + %define m14 m4 + %define m15 m3 + movzx r5, byte [esp+0x1f0] + dec srcd + movd m15, r5 + %endif + punpckldq m9, m8 + SWAP m8, m9 + paddd m14, m8 ; mx+dx*[0-1] + %if ARCH_X86_64 + mova m11, [base+pd_0x4000] + %else + %define m11 [base+pd_0x4000] + %endif + pshufd m15, m15, q0000 + pand m8, m14, m10 + psrld m8, 6 + paddd m15, m8 + movd r4d, m15 + psrldq m15, 4 + %if ARCH_X86_64 + movd r6d, m15 + %else + movd r3d, m15 + %endif + mova m5, [base+bdct_lb_dw] + mova m6, [base+subpel_s_shuf2] + movd m15, [base+subpel_filters+r4*8+2] + %if ARCH_X86_64 + movd m7, [base+subpel_filters+r6*8+2] + %else + movd m7, [base+subpel_filters+r3*8+2] + %endif + pxor m9, m9 + pcmpeqd m8, m9 + psrld m14, 10 + %if ARCH_X86_32 + mov r3, r3m + pshufb m14, m5 + paddb m14, m6 + mova [esp+0x00], m14 + %define m14 [esp+0x00] + SWAP m5, m0 + SWAP m6, m3 + %define m8 m5 + %define m15 m6 + %endif + movq m0, [srcq+ssq*0] + movq m2, [srcq+ssq*2] + movhps m0, [srcq+ssq*1] + movhps m2, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + %if ARCH_X86_64 + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pshufb m14, m5 + paddb m14, m6 + movq m10, r4 + %else + mov myd, mym + mov r5, [esp+0x1f4] + xor r3, r3 + shr myd, 6 + lea r5, [r5+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r5*8+0] + cmovnz r3, [base+subpel_filters+r5*8+4] + %define m10 m4 + movd m10, r4 + movd m3, r3 + mov r3, r3m + punpckldq m10, m3 + %endif + movq m1, [srcq+ssq*0] + movq m3, [srcq+ssq*2] + movhps m1, [srcq+ssq*1] + add srcq, ss3q + punpcklbw m10, m10 + psraw m10, 8 + punpckldq m15, m7 + punpcklqdq m15, m15 + %if ARCH_X86_64 + pand m11, m8 + %else + pand m7, m11, m8 + %define m11 m7 + %endif + pandn m8, m15 + SWAP m15, m8 + por m15, m11 + %if ARCH_X86_64 + pshufd m8, m10, q0000 + pshufd m9, m10, q1111 + pshufd m11, m10, q3333 + pshufd m10, m10, q2222 + %else + mova [esp+0x10], m15 + %define m15 [esp+0x10] + mov r0, r0m + pshufd m5, m4, q0000 + pshufd m6, m4, q1111 + pshufd m7, m4, q2222 + pshufd m4, m4, q3333 + %define m8 [esp+0x20] + %define m9 [esp+0x30] + %define m10 [esp+0x40] + %define m11 [esp+0x50] + mova m8, m5 + mova m9, m6 + mova m10, m7 + mova m11, m4 + %endif + pshufb m0, m14 + pshufb m2, m14 + pshufb m1, m14 + pshufb m3, m14 + pmaddubsw m0, m15 + pmaddubsw m2, m15 + pmaddubsw m1, m15 + pmaddubsw m3, m15 + phaddw m0, m2 + phaddw m1, m3 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + palignr m2, m1, m0, 4 + pshufd m4, m1, q2121 + punpcklwd m3, m0, m2 ; 01 12 + punpckhwd m0, m2 ; 23 34 + punpcklwd m2, m1, m4 ; 45 56 +.dy1_w2_loop: + movq m1, [srcq+ssq*0] + movhps m1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddwd m5, m3, m8 + pmaddwd m6, m0, m9 + pmaddwd m7, m2, m10 + mova m3, m0 + mova m0, m2 + paddd m5, m13 + paddd m6, m7 + pshufb m1, m14 + pmaddubsw m1, m15 + phaddw m1, m1 + pmulhrsw m1, m12 + palignr m7, m1, m4, 12 + punpcklwd m2, m7, m1 ; 67 78 + pmaddwd m7, m2, m11 + mova m4, m1 + paddd m5, m6 + paddd m5, m7 + psrad m5, rndshift + packssdw m5, m5 + packuswb m5, m5 + movd r4d, m5 + mov [dstq+dsq*0], r4w + shr r4d, 16 + mov [dstq+dsq*1], r4w + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy1_w2_loop + RET +%endif +INIT_XMM ssse3 +.dy1_w4: +%if ARCH_X86_64 + mov myd, mym + movzx t0d, t0b + dec srcq + movd m15, t0d +%else + %define m10 [base+pd_0x3ff] + %define m11 [base+pd_0x4000] + %define m8 m0 + %xdefine m14 m4 + %define m15 m3 + %if isprep + %define ssq r3 + %endif + movzx r4, byte [esp+0x1f0] + dec srcq + movd m15, r4 +%endif + 
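; (ed. note) rescale_mul is the constant [0,1,2,3], so after the paddd below m14 holds mx+dx*[0-3], the horizontal source position of each of the four output columns in 1/1024-pel units; bits 6-9 select each column's subpel filter and bits 10 and up give the integer byte offsets. +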
pmaddwd m8, [base+rescale_mul] +%if ARCH_X86_64 + mova m11, [base+pd_0x4000] +%endif + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] + pand m8, m14, m10 + psrld m8, 6 + paddd m15, m8 + psrldq m7, m15, 8 +%if ARCH_X86_64 + movd r4d, m15 + movd r11d, m7 + psrldq m15, 4 + psrldq m7, 4 + movd r6d, m15 + movd r13d, m7 + movd m15, [base+subpel_filters+ r4*8+2] + movd m2, [base+subpel_filters+r11*8+2] + movd m3, [base+subpel_filters+ r6*8+2] + movd m4, [base+subpel_filters+r13*8+2] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] +%else + movd r1, m15 + movd r3, m7 + psrldq m15, 4 + psrldq m7, 4 + movd r4, m15 + movd r5, m7 + %define m15 m5 + SWAP m4, m7 + movd m15, [base+subpel_filters+r1*8+2] + movd m2, [base+subpel_filters+r3*8+2] + movd m3, [base+subpel_filters+r4*8+2] + movd m4, [base+subpel_filters+r5*8+2] + mov myd, mym + mov rX, [esp+0x1f4] + xor r5, r5 + shr myd, 6 + lea rX, [rX+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+rX*8+0] + cmovnz r5, [base+subpel_filters+rX*8+4] + mov r3, r3m + %if isprep + lea ss3q, [ssq*3] + %endif +%endif + punpckldq m15, m3 + punpckldq m2, m4 + punpcklqdq m15, m2 + movq m6, [base+subpel_s_shuf2] +%if ARCH_X86_64 + pcmpeqd m8, m9 + psrld m14, 10 + pshufb m14, [base+bdct_lb_dw] + movu m0, [srcq+ssq*0] + movu m1, [srcq+ssq*1] + movu m2, [srcq+ssq*2] + movu m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + punpcklqdq m6, m6 + movu m4, [srcq+ssq*0] + movu m5, [srcq+ssq*1] + movu m7, [srcq+ssq*2] + add srcq, ss3q + pand m11, m8 + pandn m8, m15 + SWAP m15, m8 + por m15, m11 + paddb m14, m6 + movq m10, r4q + punpcklbw m10, m10 + psraw m10, 8 + pshufb m0, m14 + pshufb m1, m14 + pshufb m2, m14 + pshufb m3, m14 + pshufb m4, m14 + pshufb m5, m14 + pshufb m7, m14 + pmaddubsw m0, m15 + pmaddubsw m1, m15 + pmaddubsw m2, m15 + pmaddubsw m3, m15 + pmaddubsw m4, m15 + pmaddubsw m5, m15 + pmaddubsw m7, m15 + phaddw m0, m1 + phaddw m2, m3 + phaddw m4, m5 + phaddw m6, m7, m7 + pmulhrsw m0, m12 ; 0 1 + pmulhrsw m2, m12 ; 2 3 + pmulhrsw m4, m12 ; 4 5 + pmulhrsw m6, m12 ; 6 _ + shufps m1, m0, m2, q1032 ; 1 2 + shufps m3, m2, m4, q1032 ; 3 4 + shufps m5, m4, m6, q1032 ; 5 6 + punpcklwd m7, m0, m1 ; 01 + punpckhwd m0, m1 ; 12 + punpcklwd m8, m2, m3 ; 23 + punpckhwd m2, m3 ; 34 + punpcklwd m9, m4, m5 ; 45 + punpckhwd m4, m5 ; 56 +%else + pxor m3, m3 + pcmpeqd m8, m3 + psrld m14, 10 + pshufb m14, [base+bdct_lb_dw] + movu m1, [srcq+ssq*0] + movu m2, [srcq+ssq*1] + movu m3, [srcq+ssq*2] + add srcq, ss3q + punpcklqdq m6, m6 + SWAP m4, m7 + pand m7, m11, m8 + pandn m8, m15 + SWAP m5, m0 + por m15, m7 + paddb m14, m6 + movu m0, [srcq+ssq*0] + movu m7, [srcq+ssq*1] + movu m6, [srcq+ssq*2] + pshufb m1, m14 + pshufb m2, m14 + pshufb m3, m14 + pshufb m0, m14 + pshufb m7, m14 + pshufb m6, m14 + pmaddubsw m1, m15 + pmaddubsw m2, m15 + pmaddubsw m3, m15 + mova [esp+0x00], m14 + mova [esp+0x10], m15 + pmaddubsw m0, m15 + pmaddubsw m7, m15 + pmaddubsw m6, m15 + phaddw m1, m2 + movu m2, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + mov r0, r0m + phaddw m3, m0 + pshufb m2, m14 + pmaddubsw m2, m15 + %define m14 [esp+0x00] + %define m15 [esp+0x10] + phaddw m7, m6 + phaddw m2, m2 + movd m6, r4 + movd m0, r5 + punpckldq m6, m0 + punpcklbw m6, m6 + psraw m6, 8 + mova [esp+0x20], m6 + pmulhrsw m1, m12 ; 0 1 + pmulhrsw m3, m12 ; 2 3 + pmulhrsw m7, m12 ; 4 5 + pmulhrsw m2, m12 ; 6 _ + shufps m0, m1, m3, q1032 ; 1 2 + shufps m4, m3, m7, q1032 ; 3 4 + shufps m5, m7, m2, q1032 ; 5 6 + punpcklwd m6, m1, m0 ; 01 + punpckhwd m1, m0 ; 12 + mova [esp+0x30], m1 + 
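; (ed. note, x86_32 path) the seven horizontally filtered rows are interleaved into overlapping word pairs (01, 12, 23, 34, 45, 56) so that one pmaddwd per vertical tap pair filters two adjacent rows at once; pairs that do not fit in the eight available registers are spilled to the stack. +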
punpcklwd m1, m3, m4 ; 23 + punpckhwd m3, m4 ; 34 + mova [esp+0x40], m3 + punpcklwd m3, m7, m5 ; 45 + punpckhwd m7, m5 ; 56 + mova [esp+0x50], m7 + mova [esp+0x60], m2 + mova m0, [esp+0x20] + %xdefine m8 m1 + %xdefine m9 m3 + %xdefine m10 m0 + SWAP m7, m6 + SWAP m1, m4 + SWAP m3, m2 +%endif + pshufd m1, m10, q0000 + pshufd m3, m10, q1111 + pshufd m5, m10, q2222 + pshufd m10, m10, q3333 +%if ARCH_X86_64 + mova [rsp+0x00], m8 + mova [rsp+0x10], m2 + mova [rsp+0x20], m9 + mova [rsp+0x30], m4 +%else + mova [esp+0x70], m8 + mova [esp+0x80], m9 + mova [esp+0x90], m1 + mova [esp+0xa0], m3 + mova [esp+0xb0], m5 + mova [esp+0xc0], m10 + %ifidn %1, put + mov dsd, dsm + %endif + %define m11 m6 +%endif +.dy1_w4_loop: +%if ARCH_X86_64 + movu m11, [srcq+ssq*0] + pmaddwd m7, m1 + pmaddwd m8, m3 + pmaddwd m0, m1 + pmaddwd m2, m3 + pmaddwd m9, m5 + pmaddwd m4, m5 + paddd m7, m8 + paddd m0, m2 + movu m8, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb m11, m14 + pmaddubsw m11, m15 + paddd m7, m13 + paddd m0, m13 + paddd m7, m9 + paddd m0, m4 + pshufb m8, m14 + pmaddubsw m8, m15 + phaddw m11, m8 + mova m8, [rsp+0x20] + pmulhrsw m11, m12 + punpcklwd m9, m6, m11 ; 67 + psrldq m6, m11, 8 + punpcklwd m4, m11, m6 ; 78 + pmaddwd m2, m9, m10 + pmaddwd m11, m4, m10 + paddd m7, m2 + mova m2, [rsp+0x30] + paddd m0, m11 +%else + SWAP m7, m6 + SWAP m1, m4 + SWAP m3, m2 + movu m5, [srcq+ssq*0] + mova m0, [esp+0x30] + mova m2, [esp+0x40] + mova m4, [esp+0x50] + pmaddwd m6, [esp+0x90] + pmaddwd m1, [esp+0xa0] + pmaddwd m0, [esp+0x90] + pmaddwd m2, [esp+0xa0] + pmaddwd m3, [esp+0xb0] + pmaddwd m4, [esp+0xb0] + paddd m6, m1 + paddd m0, m2 + movu m7, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pshufb m5, m14 + pmaddubsw m5, m15 + paddd m6, m13 + paddd m0, m13 + paddd m6, m3 + paddd m0, m4 + pshufb m7, m14 + pmaddubsw m7, m15 + phaddw m5, m7 + mova m7, [rsp+0x80] + pmulhrsw m5, m12 + punpcklwd m3, [esp+0x60], m5 ; 67 + psrldq m1, m5, 8 + punpcklwd m4, m5, m1 ; 78 + pmaddwd m2, m3, [esp+0xc0] + pmaddwd m5, m4, [esp+0xc0] + mova [esp+0x60], m1 + paddd m6, m2 + mova m2, [esp+0x50] + paddd m0, m5 + SWAP m7, m6 +%endif + psrad m7, rndshift + psrad m0, rndshift + packssdw m7, m0 +%if ARCH_X86_64 + mova m0, [rsp+0x10] +%else + mova m0, [esp+0x40] +%define m11 m5 +%endif +%ifidn %1, put + packuswb m7, m7 + psrldq m11, m7, 4 + movd [dstq+dsq*0], m7 + movd [dstq+dsq*1], m11 + lea dstq, [dstq+dsq*2] +%else + mova [tmpq], m7 + add tmpq, 16 +%endif + sub hd, 2 + jz .ret +%if ARCH_X86_64 + mova m7, [rsp+0x00] + mova [rsp+0x00], m8 + mova [rsp+0x10], m2 + mova [rsp+0x20], m9 + mova [rsp+0x30], m4 +%else + mova m7, [esp+0x70] ; 01 + mova m1, [esp+0x80] ; 23 + mova m2, [esp+0x50] ; 34 + mova [esp+0x30], m0 + mova [esp+0x70], m1 + mova [esp+0x40], m2 + mova [esp+0x80], m3 + mova [esp+0x50], m4 +%endif + jmp .dy1_w4_loop +INIT_XMM ssse3 +.dy1_w8: + mov dword [rsp+0x90], 1 + movifprep tmp_stridem, 16 + jmp .dy1_w_start +.dy1_w16: + mov dword [rsp+0x90], 2 + movifprep tmp_stridem, 32 + jmp .dy1_w_start +.dy1_w32: + mov dword [rsp+0x90], 4 + movifprep tmp_stridem, 64 + jmp .dy1_w_start +.dy1_w64: + mov dword [rsp+0x90], 8 + movifprep tmp_stridem, 128 + jmp .dy1_w_start +.dy1_w128: + mov dword [rsp+0x90], 16 + movifprep tmp_stridem, 256 +.dy1_w_start: + mov myd, mym +%ifidn %1, put + movifnidn dsm, dsq +%endif +%if ARCH_X86_64 + shr t0d, 16 + sub srcq, 3 + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + movd m15, t0d +%else + %define m8 m0 + %define m9 m1 + %xdefine m14 m4 + %xdefine m15 m3 + %if isprep + 
%define ssq ssm + %endif + mov r5, [esp+0x1f0] + mov r3, [esp+0x1f4] + shr r5, 16 + sub srcq, 3 + movd m15, r5 + xor r5, r5 + shr myd, 6 + lea r3, [r3+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r3*8+0] + cmovnz r5, [base+subpel_filters+r3*8+4] + mov r0, r0m + mov r3, r3m +%endif + pslld m7, m8, 2 ; dx*4 + pmaddwd m8, [base+rescale_mul] ; dx*[0-3] + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] +%if ARCH_X86_64 + movq m3, r4q + punpcklbw m3, m3 + psraw m3, 8 +%else + movd m5, r4 + movd m6, r5 + punpckldq m5, m6 + punpcklbw m5, m5 + psraw m5, 8 + SWAP m3, m5 +%endif + mova [rsp+0x100], m7 + mova [rsp+0x120], m15 + mov [rsp+0x098], srcq + mov [rsp+0x130], r0q ; dstq / tmpq + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova [rsp+0x140], m0 + mova [rsp+0x150], m1 + mova [rsp+0x160], m2 + mova [rsp+0x170], m3 +%if ARCH_X86_64 && UNIX64 + mov hm, hd +%elif ARCH_X86_32 + SWAP m5, m3 + mov r5, hm + mov [esp+0x134], r5 +%endif + jmp .dy1_hloop +.dy1_hloop_prep: + dec dword [rsp+0x090] + jz .ret +%if ARCH_X86_64 + add qword [rsp+0x130], 8*(isprep+1) + mov hd, hm +%else + add dword [rsp+0x130], 8*(isprep+1) + mov r5, [esp+0x134] + mov r0, [esp+0x130] +%endif + mova m7, [rsp+0x100] + mova m14, [rsp+0x110] +%if ARCH_X86_64 + mova m10, [base+pd_0x3ff] +%else + %define m10 [base+pd_0x3ff] +%endif + mova m15, [rsp+0x120] + mov srcq, [rsp+0x098] +%if ARCH_X86_64 + mov r0q, [rsp+0x130] ; dstq / tmpq +%else + mov hm, r5 + mov r0m, r0 + mov r3, r3m +%endif + paddd m14, m7 +.dy1_hloop: + pxor m9, m9 +%if ARCH_X86_64 + mova m11, [base+pq_0x40000000] +%else + %define m11 [base+pq_0x40000000] +%endif + psrld m2, m14, 10 + mova [rsp], m2 + pand m6, m14, m10 + psrld m6, 6 + paddd m5, m15, m6 + pcmpeqd m6, m9 + psrldq m2, m5, 8 +%if ARCH_X86_64 + movd r4d, m5 + movd r6d, m2 + psrldq m5, 4 + psrldq m2, 4 + movd r7d, m5 + movd r9d, m2 + movq m0, [base+subpel_filters+r4*8] + movq m1, [base+subpel_filters+r6*8] + movhps m0, [base+subpel_filters+r7*8] + movhps m1, [base+subpel_filters+r9*8] +%else + movd r0, m5 + movd rX, m2 + psrldq m5, 4 + psrldq m2, 4 + movd r4, m5 + movd r5, m2 + movq m0, [base+subpel_filters+r0*8] + movq m1, [base+subpel_filters+rX*8] + movhps m0, [base+subpel_filters+r4*8] + movhps m1, [base+subpel_filters+r5*8] + pxor m2, m2 + %define m9 m2 +%endif + paddd m14, m7 ; mx+dx*[4-7] + pand m5, m14, m10 + psrld m5, 6 + paddd m15, m5 + pcmpeqd m5, m9 + mova [rsp+0x110], m14 + psrldq m4, m15, 8 +%if ARCH_X86_64 + movd r10d, m15 + movd r11d, m4 + psrldq m15, 4 + psrldq m4, 4 + movd r13d, m15 + movd rXd, m4 + movq m2, [base+subpel_filters+r10*8] + movq m3, [base+subpel_filters+r11*8] + movhps m2, [base+subpel_filters+r13*8] + movhps m3, [base+subpel_filters+ rX*8] + psrld m14, 10 + psrldq m4, m14, 8 + movd r10d, m14 + movd r11d, m4 + psrldq m14, 4 + psrldq m4, 4 + movd r13d, m14 + movd rXd, m4 + mov r4d, [rsp+ 0] + mov r6d, [rsp+ 8] + mov r7d, [rsp+ 4] + mov r9d, [rsp+12] + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m7, m5, q1100 + pshufd m5, m5, q3322 + pand m8, m11, m4 + pand m9, m11, m6 + pand m15, m11, m7 + pand m11, m11, m5 + pandn m4, m0 + pandn m6, m1 + pandn m7, m2 + pandn m5, m3 + por m8, m4 + por m9, m6 + por m15, m7 + por m11, m5 + mova [rsp+0x10], m8 + mova [rsp+0x20], m9 + mova [rsp+0x30], m15 + mova [rsp+0x40], m11 + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1 + mova [rsp+0x50], m1 + mova [rsp+0x60], m2 + MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3 + mova [rsp+0x70], m3 + mova 
[rsp+0x80], m4 + MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5 + MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7 + SWAP m7, m0 + SWAP m8, m14 + mova m1, [rsp+0x50] + mova m2, [rsp+0x60] + mova m3, [rsp+0x70] + mova m15, [rsp+0x80] + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m8 ; 67a + punpckhwd m7, m8 ; 67b + SWAP m14, m8 + mova m8, [rsp+0x140] + mova m9, [rsp+0x150] + mova m10, [rsp+0x160] + mova m11, [rsp+0x170] + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m15; 23a + punpckhwd m3, m15 ; 23b + mova [rsp+0x50], m4 + mova [rsp+0x60], m5 + mova [rsp+0x70], m6 + mova [rsp+0x80], m7 + mova m14, [base+unpckw] +%else + movd r0, m15 + movd rX, m4 + psrldq m15, 4 + psrldq m4, 4 + movd r4, m15 + movd r5, m4 + mova m14, [esp+0x110] + movq m2, [base+subpel_filters+r0*8] + movq m3, [base+subpel_filters+rX*8] + movhps m2, [base+subpel_filters+r4*8] + movhps m3, [base+subpel_filters+r5*8] + psrld m14, 10 + mova [esp+16], m14 + mov r0, [esp+ 0] + mov rX, [esp+ 8] + mov r4, [esp+ 4] + mov r5, [esp+12] + mova [esp+0x20], m0 + mova [esp+0x30], m1 + mova [esp+0x40], m2 + mova [esp+0x50], m3 + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m7, m5, q1100 + pshufd m5, m5, q3322 + pand m0, m11, m4 + pand m1, m11, m6 + pand m2, m11, m7 + pand m3, m11, m5 + pandn m4, [esp+0x20] + pandn m6, [esp+0x30] + pandn m7, [esp+0x40] + pandn m5, [esp+0x50] + por m0, m4 + por m1, m6 + por m2, m7 + por m3, m5 + mova [esp+0x20], m0 + mova [esp+0x30], m1 + mova [esp+0x40], m2 + mova [esp+0x50], m3 + MC_8TAP_SCALED_H 0x20, 0x60, 0 ; 0-1 + MC_8TAP_SCALED_H 0x20, 0x180 ; 2-3 + MC_8TAP_SCALED_H 0x20, 0x1a0 ; 4-5 + MC_8TAP_SCALED_H 0x20, 0x1c0 ; 6-7 + mova m5, [esp+0x1a0] + mova m6, [esp+0x1b0] + mova m7, [esp+0x1c0] + mova m0, [esp+0x1d0] + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m0 ; 67a + punpckhwd m7, m0 ; 67b + mova [esp+0x1a0], m4 + mova [esp+0x1b0], m5 + mova [esp+0x1c0], m6 + mova [esp+0x1d0], m7 + mova m1, [esp+0x060] + mova m2, [esp+0x070] + mova m3, [esp+0x180] + mova m4, [esp+0x190] + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m4 ; 23a + punpckhwd m3, m4 ; 23b + mova [esp+0x060], m0 + mova [esp+0x070], m1 + mova [esp+0x180], m2 + mova [esp+0x190], m3 + %define m8 [esp+0x140] + %define m9 [esp+0x150] + %define m10 [esp+0x160] + %define m11 [esp+0x170] +%endif +.dy1_vloop: +%if ARCH_X86_32 + mov r0, r0m +%endif + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m8 + pmaddwd m6, m2, m9 + pmaddwd m7, m3, m9 + paddd m4, m13 + paddd m5, m13 + paddd m4, m6 + paddd m5, m7 +%if ARCH_X86_64 + pmaddwd m6, [rsp+0x50], m10 + pmaddwd m7, [rsp+0x60], m10 +%else + pmaddwd m6, [rsp+0x1a0], m10 + pmaddwd m7, [rsp+0x1b0], m10 +%endif + paddd m4, m6 + paddd m5, m7 +%if ARCH_X86_64 + pmaddwd m6, [rsp+0x70], m11 + pmaddwd m7, [rsp+0x80], m11 +%else + pmaddwd m6, [rsp+0x1c0], m11 + pmaddwd m7, [rsp+0x1d0], m11 +%endif + paddd m4, m6 + paddd m5, m7 + psrad m4, rndshift + psrad m5, rndshift + packssdw m4, m5 +%ifidn %1, put + packuswb m4, m4 + movq [dstq], m4 + add dstq, dsm +%else + mova [tmpq], m4 + add tmpq, tmp_stridem +%endif +%if ARCH_X86_32 + mov r0m, r0 +%endif + dec hd + jz .dy1_hloop_prep +%if ARCH_X86_64 + movq m4, [srcq+ r4] + movq m5, [srcq+ r6] + movhps m4, [srcq+ r7] + movhps m5, [srcq+ r9] + movq m6, [srcq+r10] + movq m7, [srcq+r11] + movhps m6, [srcq+r13] + movhps m7, [srcq+ rX] + add srcq, ssq + pshufd m15, m14, q1032 + pshufb m0, m14 ; 0a 1a + pshufb m1, m14 ; 0b 1b + pshufb m2, m15 ; 3a 
2a + pshufb m3, m15 ; 3b 2b + pmaddubsw m4, [rsp+0x10] + pmaddubsw m5, [rsp+0x20] + pmaddubsw m6, [rsp+0x30] + pmaddubsw m7, [rsp+0x40] + phaddw m4, m5 + phaddw m6, m7 + phaddw m4, m6 + pmulhrsw m4, m12 + pshufb m5, [rsp+0x70], m15 ; 7a 6a + pshufb m7, [rsp+0x80], m15 ; 7b 6b + pshufb m6, [rsp+0x50], m14 ; 4a 5a + pshufb m15, [rsp+0x60], m14 ; 4b 5b + punpckhwd m0, m2 ; 12a + punpckhwd m1, m3 ; 12b + punpcklwd m2, m6 ; 34a + punpcklwd m3, m15 ; 34b + punpckhwd m6, m5 ; 56a + punpckhwd m15, m7 ; 56b + punpcklwd m5, m4 ; 78a + psrldq m4, 8 + punpcklwd m7, m4 ; 78b + mova [rsp+0x50], m6 + mova [rsp+0x60], m15 + mova [rsp+0x70], m5 + mova [rsp+0x80], m7 +%else + mov r0, [esp+ 0] + mov rX, [esp+ 8] + mov r4, [esp+ 4] + mov r5, [esp+12] + mova m6, [base+unpckw] + mova m0, [esp+0x060] + mova m1, [esp+0x070] + mova m7, [esp+0x1a0] + movq m4, [srcq+r0] + movq m5, [srcq+rX] + movhps m4, [srcq+r4] + movhps m5, [srcq+r5] + pshufb m0, m6 ; 0a 1a + pshufb m1, m6 ; 0b 1b + pshufb m7, m6 ; 4a 5a + mov r0, [esp+16] + mov rX, [esp+24] + mov r4, [esp+20] + mov r5, [esp+28] + movq m3, [srcq+r0] + movq m2, [srcq+rX] + movhps m3, [srcq+r4] + movhps m2, [srcq+r5] + add srcq, ssq + pmaddubsw m4, [esp+0x20] + pmaddubsw m5, [esp+0x30] + pmaddubsw m3, [esp+0x40] + pmaddubsw m2, [esp+0x50] + phaddw m4, m5 + phaddw m3, m2 + mova m5, [esp+0x1b0] + mova m2, [esp+0x180] + phaddw m4, m3 + mova m3, [esp+0x190] + pmulhrsw m4, m12 ; 8a 8b + pshufb m5, m6 ; 4b 5b + pshufd m6, m6, q1032 + pshufb m2, m6 ; 3a 2a + pshufb m3, m6 ; 3b 2b + punpckhwd m0, m2 ; 12a + punpckhwd m1, m3 ; 12b + mova [esp+0x60], m0 + mova [esp+0x70], m1 + mova m0, [esp+0x1c0] + mova m1, [esp+0x1d0] + punpcklwd m2, m7 ; 34a + punpcklwd m3, m5 ; 34b + mova [esp+0x180], m2 + mova [esp+0x190], m3 + pshufb m0, m6 ; 7a 6a + pshufb m1, m6 ; 7b 6b + punpckhwd m7, m0 ; 56a + punpckhwd m5, m1 ; 56b + punpcklwd m0, m4 + punpckhqdq m4, m4 + punpcklwd m1, m4 + mova [esp+0x1a0], m7 + mova [esp+0x1b0], m5 + mova [esp+0x1c0], m0 + mova [esp+0x1d0], m1 + mova m0, [esp+0x60] + mova m1, [esp+0x70] +%endif + jmp .dy1_vloop +INIT_XMM ssse3 +.dy2: + movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2] + add wq, base_reg + jmp wq +%ifidn %1, put +.dy2_w2: + %if ARCH_X86_64 + mov myd, mym + movzx t0d, t0b + dec srcq + movd m15, t0d + %else + %define m10 [base+pd_0x3ff] + %define m11 [base+pd_0x4000] + %define m8 m0 + %define m9 m1 + %define m14 m4 + %define m15 m3 + movzx r5, byte [esp+0x1f0] + dec srcd + movd m15, r5 + %endif + punpckldq m9, m8 + SWAP m8, m9 + paddd m14, m8 ; mx+dx*[0-1] + %if ARCH_X86_64 + mova m11, [base+pd_0x4000] + %endif + pshufd m15, m15, q0000 + pand m8, m14, m10 + psrld m8, 6 + paddd m15, m8 + movd r4d, m15 + psrldq m15, 4 + %if ARCH_X86_64 + movd r6d, m15 + %else + movd r3d, m15 + %endif + mova m5, [base+bdct_lb_dw] + mova m6, [base+subpel_s_shuf2] + movd m15, [base+subpel_filters+r4*8+2] + %if ARCH_X86_64 + movd m7, [base+subpel_filters+r6*8+2] + %else + movd m7, [base+subpel_filters+r3*8+2] + %endif + pxor m9, m9 + pcmpeqd m8, m9 + psrld m14, 10 + %if ARCH_X86_32 + mov r3, r3m + pshufb m14, m5 + paddb m14, m6 + mova [esp+0x00], m14 + %define m14 [esp+0x00] + SWAP m5, m0 + SWAP m6, m3 + %define m8 m5 + %define m15 m6 + %endif + movq m0, [srcq+ssq*0] + movq m1, [srcq+ssq*1] + movhps m0, [srcq+ssq*2] + movhps m1, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + %if ARCH_X86_64 + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pshufb m14, m5 + paddb m14, m6 + movq m10, r4q + %else + mov myd, mym + mov r3, 
[esp+0x1f4] + xor r5, r5 + shr myd, 6 + lea r3, [r3+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r3*8+0] + cmovnz r5, [base+subpel_filters+r3*8+4] + mov r3, r3m + %define m10 m4 + movd m10, r4 + movd m3, r5 + punpckldq m10, m3 + %endif + movq m3, [srcq+ssq*0] + movhps m3, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + punpcklbw m10, m10 + psraw m10, 8 + punpckldq m15, m7 + punpcklqdq m15, m15 + %if ARCH_X86_64 + pand m11, m8 + %else + pand m7, m11, m8 + %define m11 m7 + %endif + pandn m8, m15 + SWAP m15, m8 + por m15, m11 + %if ARCH_X86_64 + pshufd m8, m10, q0000 + pshufd m9, m10, q1111 + pshufd m11, m10, q3333 + pshufd m10, m10, q2222 + %else + mova [esp+0x10], m15 + %define m15 [esp+0x10] + mov r5, r0m + %define dstq r5 + mov dsd, dsm + pshufd m5, m4, q0000 + pshufd m6, m4, q1111 + pshufd m7, m4, q2222 + pshufd m4, m4, q3333 + %define m8 [esp+0x20] + %define m9 [esp+0x30] + %define m10 [esp+0x40] + %define m11 [esp+0x50] + mova m8, m5 + mova m9, m6 + mova m10, m7 + mova m11, m4 + %endif + pshufb m0, m14 + pshufb m1, m14 + pshufb m3, m14 + pmaddubsw m0, m15 + pmaddubsw m1, m15 + pmaddubsw m3, m15 + pslldq m2, m3, 8 + phaddw m0, m2 + phaddw m1, m3 + pmulhrsw m0, m12 ; 0 2 _ 4 + pmulhrsw m1, m12 ; 1 3 _ 5 + pshufd m2, m0, q3110 ; 0 2 2 4 + pshufd m1, m1, q3110 ; 1 3 3 5 + punpcklwd m3, m2, m1 ; 01 23 + punpckhwd m2, m1 ; 23 45 +.dy2_w2_loop: + movq m6, [srcq+ssq*0] + movq m7, [srcq+ssq*1] + movhps m6, [srcq+ssq*2] + movhps m7, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pmaddwd m4, m3, m8 + pmaddwd m5, m2, m9 + pshufb m6, m14 + pshufb m7, m14 + pmaddubsw m6, m15 + pmaddubsw m7, m15 + phaddw m6, m7 + pmulhrsw m6, m12 + psrldq m7, m6, 8 + palignr m6, m0, 8 + palignr m7, m1, 8 + mova m0, m6 + mova m1, m7 + pshufd m6, m6, q3221 + pshufd m7, m7, q3221 + punpcklwd m3, m6, m7 ; 45 67 + punpckhwd m2, m6, m7 ; 67 89 + pmaddwd m6, m3, m10 + pmaddwd m7, m2, m11 + paddd m4, m5 + paddd m4, m13 + paddd m6, m7 + paddd m4, m6 + psrad m4, rndshift + packssdw m4, m4 + packuswb m4, m4 + movd r4d, m4 + mov [dstq+dsq*0], r4w + shr r4d, 16 + mov [dstq+dsq*1], r4w + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy2_w2_loop + RET +%endif +INIT_XMM ssse3 +.dy2_w4: +%if ARCH_X86_64 + mov myd, mym + movzx t0d, t0b + dec srcq + movd m15, t0d +%else + %define m10 [base+pd_0x3ff] + %define m11 [base+pd_0x4000] + %define m8 m0 + %xdefine m14 m4 + %define m15 m3 + %define dstq r0 + %if isprep + %define ssq r3 + %endif + movzx r4, byte [esp+0x1f0] + dec srcq + movd m15, r4 +%endif + pmaddwd m8, [base+rescale_mul] +%if ARCH_X86_64 + mova m11, [base+pd_0x4000] +%endif + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] + pand m8, m14, m10 + psrld m8, 6 + paddd m15, m8 + psrldq m7, m15, 8 +%if ARCH_X86_64 + movd r4d, m15 + movd r11d, m7 + psrldq m15, 4 + psrldq m7, 4 + movd r6d, m15 + movd r13d, m7 + movd m15, [base+subpel_filters+ r4*8+2] + movd m2, [base+subpel_filters+r11*8+2] + movd m3, [base+subpel_filters+ r6*8+2] + movd m4, [base+subpel_filters+r13*8+2] + movq m6, [base+subpel_s_shuf2] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] +%else + movd r1, m15 + movd r3, m7 + psrldq m15, 4 + psrldq m7, 4 + movd r4, m15 + movd r5, m7 + %define m15 m5 + SWAP m4, m7 + movd m15, [base+subpel_filters+r1*8+2] + movd m2, [base+subpel_filters+r3*8+2] + movd m3, [base+subpel_filters+r4*8+2] + movd m4, [base+subpel_filters+r5*8+2] + movq m6, [base+subpel_s_shuf2] + mov myd, mym + mov r3, [esp+0x1f4] + xor r5, r5 + shr myd, 6 + lea r3, [r3+myd] + mov r4, 64 << 24 + cmovnz r4, 
[base+subpel_filters+r3*8+0] + cmovnz r5, [base+subpel_filters+r3*8+4] + mov r3, r3m + %if isprep + lea ss3q, [ssq*3] + %endif +%endif + punpckldq m15, m3 + punpckldq m2, m4 + punpcklqdq m15, m2 +%if ARCH_X86_64 + pcmpeqd m8, m9 + psrld m14, 10 + movu m0, [srcq+ssq*0] + movu m2, [srcq+ssq*2] + movu m1, [srcq+ssq*1] + movu m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + punpcklqdq m6, m6 + pshufb m14, [base+bdct_lb_dw] + movu m4, [srcq+ssq*0] + movu m5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pand m11, m8 + pandn m8, m15 + SWAP m15, m8 + por m15, m11 + paddb m14, m6 + movq m11, r4q + punpcklbw m11, m11 + psraw m11, 8 + pshufb m0, m14 + pshufb m2, m14 + pshufb m1, m14 + pshufb m3, m14 + pshufb m4, m14 + pshufb m5, m14 + pmaddubsw m0, m15 + pmaddubsw m2, m15 + pmaddubsw m1, m15 + pmaddubsw m3, m15 + pmaddubsw m4, m15 + pmaddubsw m5, m15 + phaddw m0, m2 + phaddw m1, m3 + phaddw m4, m5 + pmulhrsw m0, m12 ; 0 2 + pmulhrsw m1, m12 ; 1 3 + pmulhrsw m4, m12 ; 4 5 + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pshufd m10, m11, q2222 + pshufd m11, m11, q3333 +%else + pxor m3, m3 + pcmpeqd m8, m3 + psrld m14, 10 + pshufb m14, [base+bdct_lb_dw] + movu m1, [srcq+ssq*0] + movu m2, [srcq+ssq*2] + movu m3, [srcq+ssq*1] + add srcq, ss3q + punpcklqdq m6, m6 + SWAP m4, m7 + pand m7, m11, m8 + pandn m8, m15 + SWAP m15, m8 + por m15, m7 + paddb m14, m6 + movu m0, [srcq+ssq*0] + movu m7, [srcq+ssq*1] + movu m6, [srcq+ssq*2] + add srcq, ss3q + pshufb m1, m14 + pshufb m2, m14 + pshufb m3, m14 + pshufb m0, m14 + pshufb m7, m14 + pshufb m6, m14 + pmaddubsw m1, m15 + pmaddubsw m2, m15 + pmaddubsw m3, m15 + mova [esp+0x00], m14 + mova [esp+0x10], m15 + pmaddubsw m0, m15 + pmaddubsw m7, m15 + pmaddubsw m6, m15 + %define m14 [esp+0x00] + %define m15 [esp+0x10] + phaddw m1, m2 + phaddw m3, m0 + phaddw m7, m6 + %ifidn %1, put + mov dsd, dsm + %define dstq r5 + %else + %define tmpq r5 + %endif + movd m6, r4 + movd m0, r5 + punpckldq m6, m0 + punpcklbw m6, m6 + psraw m6, 8 + mov r5, r0m + pmulhrsw m1, m12 ; 0 2 + pmulhrsw m3, m12 ; 1 3 + pmulhrsw m7, m12 ; 4 5 + SWAP m0, m1, m3 + SWAP m4, m7 + pshufd m2, m6, q0000 + pshufd m3, m6, q1111 + pshufd m7, m6, q2222 + pshufd m6, m6, q3333 + mova [esp+0x30], m2 + mova [esp+0x40], m3 + mova [esp+0x50], m7 + mova [esp+0x60], m6 + %define m8 [esp+0x30] + %define m9 [esp+0x40] + %define m10 [esp+0x50] + %define m11 [esp+0x60] +%endif + psrldq m5, m4, 8 ; 5 _ + punpckhwd m2, m0, m1 ; 23 + punpcklwd m0, m1 ; 01 + punpcklwd m4, m5 ; 45 +.dy2_w4_loop: + pmaddwd m0, m8 ; a0 + pmaddwd m5, m2, m8 ; b0 + pmaddwd m2, m9 ; a1 + pmaddwd m7, m4, m9 ; b1 + pmaddwd m3, m4, m10 ; a2 + paddd m0, m13 + paddd m5, m13 + paddd m0, m2 + paddd m5, m7 + paddd m0, m3 + movu m6, [srcq+ssq*0] + movu m7, [srcq+ssq*1] + movu m3, [srcq+ssq*2] + movu m1, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pshufb m6, m14 + pshufb m7, m14 + pshufb m3, m14 + pshufb m1, m14 + pmaddubsw m6, m15 + pmaddubsw m7, m15 + pmaddubsw m3, m15 + pmaddubsw m1, m15 + phaddw m6, m7 + phaddw m3, m1 + pmulhrsw m6, m12 ; 6 7 + pmulhrsw m3, m12 ; 8 9 + psrldq m7, m6, 8 + psrldq m1, m3, 8 + punpcklwd m6, m7 ; 67 + punpcklwd m3, m1 ; 89 + mova m2, m6 + pmaddwd m1, m6, m10 ; b2 + pmaddwd m6, m11 ; a3 + pmaddwd m7, m3, m11 ; b3 + paddd m5, m1 + paddd m0, m6 + paddd m5, m7 + psrad m0, rndshift + psrad m5, rndshift + packssdw m0, m5 +%ifidn %1, put + packuswb m0, m0 + psrldq m1, m0, 4 + movd [dstq+dsq*0], m0 + movd [dstq+dsq*1], m1 + lea dstq, [dstq+dsq*2] +%else + mova [tmpq], m0 + add tmpq, 16 +%endif + mova m0, m4 + mova m4, m3 + sub hd, 2 + jg 
.dy2_w4_loop + MC_8TAP_SCALED_RET +INIT_XMM ssse3 +.dy2_w8: + mov dword [rsp+0x90], 1 + movifprep tmp_stridem, 16 + jmp .dy2_w_start +.dy2_w16: + mov dword [rsp+0x90], 2 + movifprep tmp_stridem, 32 + jmp .dy2_w_start +.dy2_w32: + mov dword [rsp+0x90], 4 + movifprep tmp_stridem, 64 + jmp .dy2_w_start +.dy2_w64: + mov dword [rsp+0x90], 8 + movifprep tmp_stridem, 128 + jmp .dy2_w_start +.dy2_w128: + mov dword [rsp+0x90], 16 + movifprep tmp_stridem, 256 +.dy2_w_start: + mov myd, mym +%ifidn %1, put + movifnidn dsm, dsq +%endif +%if ARCH_X86_64 + shr t0d, 16 + sub srcq, 3 + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + movd m15, t0d +%else + %define m10 [base+pd_0x3ff] + %define m11 [base+pd_0x4000] + %define m8 m0 + %define m9 m1 + %xdefine m14 m4 + %xdefine m15 m3 + %if isprep + %define tmpq r0 + %define ssq ssm + %else + %define dstq r0 + %endif + mov r5, [esp+0x1f0] + mov r3, [esp+0x1f4] + shr r5, 16 + sub srcq, 3 + movd m15, r5 + xor r5, r5 + shr myd, 6 + lea r3, [r3+myd] + mov r4, 64 << 24 + cmovnz r4, [base+subpel_filters+r3*8+0] + cmovnz r5, [base+subpel_filters+r3*8+4] + mov r0, r0m + mov r3, r3m +%endif + pslld m7, m8, 2 ; dx*4 + pmaddwd m8, [base+rescale_mul] ; dx*[0-3] + pshufd m15, m15, q0000 + paddd m14, m8 ; mx+dx*[0-3] +%if ARCH_X86_64 + movq m3, r4q + punpcklbw m3, m3 + psraw m3, 8 +%else + movd m5, r4 + movd m6, r5 + punpckldq m5, m6 + punpcklbw m5, m5 + psraw m5, 8 + SWAP m3, m5 +%endif + mova [rsp+0x100], m7 + mova [rsp+0x120], m15 + mov [rsp+0x098], srcq + mov [rsp+0x130], r0q ; dstq / tmpq + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + mova [rsp+0x140], m0 + mova [rsp+0x150], m1 + mova [rsp+0x160], m2 + mova [rsp+0x170], m3 +%if ARCH_X86_64 && UNIX64 + mov hm, hd +%elif ARCH_X86_32 + SWAP m5, m3 + mov r5, hm + mov [esp+0x134], r5 +%endif + jmp .dy2_hloop +.dy2_hloop_prep: + dec dword [rsp+0x090] + jz .ret +%if ARCH_X86_64 + add qword [rsp+0x130], 8*(isprep+1) + mov hd, hm +%else + add dword [rsp+0x130], 8*(isprep+1) + mov r5, [esp+0x134] + mov r0, [esp+0x130] +%endif + mova m7, [rsp+0x100] + mova m14, [rsp+0x110] +%if ARCH_X86_64 + mova m10, [base+pd_0x3ff] +%else + %define m10 [base+pd_0x3ff] +%endif + mova m15, [rsp+0x120] + mov srcq, [rsp+0x098] +%if ARCH_X86_64 + mov r0q, [rsp+0x130] ; dstq / tmpq +%else + mov hm, r5 + mov r0m, r0 + mov r3, r3m +%endif + paddd m14, m7 +.dy2_hloop: + pxor m9, m9 +%if ARCH_X86_64 + mova m11, [base+pq_0x40000000] +%else + %define m11 [base+pq_0x40000000] +%endif + psrld m2, m14, 10 + mova [rsp], m2 + pand m6, m14, m10 + psrld m6, 6 + paddd m5, m15, m6 + pcmpeqd m6, m9 + psrldq m2, m5, 8 +%if ARCH_X86_64 + movd r4d, m5 + movd r6d, m2 + psrldq m5, 4 + psrldq m2, 4 + movd r7d, m5 + movd r9d, m2 + movq m0, [base+subpel_filters+r4*8] + movq m1, [base+subpel_filters+r6*8] + movhps m0, [base+subpel_filters+r7*8] + movhps m1, [base+subpel_filters+r9*8] +%else + movd r0, m5 + movd rX, m2 + psrldq m5, 4 + psrldq m2, 4 + movd r4, m5 + movd r5, m2 + movq m0, [base+subpel_filters+r0*8] + movq m1, [base+subpel_filters+rX*8] + movhps m0, [base+subpel_filters+r4*8] + movhps m1, [base+subpel_filters+r5*8] + pxor m2, m2 + %define m9 m2 +%endif + paddd m14, m7 ; mx+dx*[4-7] + pand m5, m14, m10 + psrld m5, 6 + paddd m15, m5 + pcmpeqd m5, m9 + mova [rsp+0x110], m14 + psrldq m4, m15, 8 +%if ARCH_X86_64 + movd r10d, m15 + movd r11d, m4 + psrldq m15, 4 + psrldq m4, 4 + movd r13d, m15 + movd rXd, m4 + movq m2, [base+subpel_filters+r10*8] + movq m3, 
[base+subpel_filters+r11*8] + movhps m2, [base+subpel_filters+r13*8] + movhps m3, [base+subpel_filters+ rX*8] + psrld m14, 10 + psrldq m4, m14, 8 + movd r10d, m14 + movd r11d, m4 + psrldq m14, 4 + psrldq m4, 4 + movd r13d, m14 + movd rXd, m4 + mov r4d, [rsp+ 0] + mov r6d, [rsp+ 8] + mov r7d, [rsp+ 4] + mov r9d, [rsp+12] + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m7, m5, q1100 + pshufd m5, m5, q3322 + pand m8, m11, m4 + pand m9, m11, m6 + pand m15, m11, m7 + pand m11, m11, m5 + pandn m4, m0 + pandn m6, m1 + pandn m7, m2 + pandn m5, m3 + por m8, m4 + por m9, m6 + por m15, m7 + por m11, m5 + mova [rsp+0x10], m8 + mova [rsp+0x20], m9 + mova [rsp+0x30], m15 + mova [rsp+0x40], m11 + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1 + mova [rsp+0x50], m1 + mova [rsp+0x60], m2 + MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3 + mova [rsp+0x70], m3 + mova [rsp+0x80], m4 + MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5 + MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7 + SWAP m7, m0 + SWAP m8, m14 + mova m1, [rsp+0x50] + mova m2, [rsp+0x60] + mova m3, [rsp+0x70] + mova m15, [rsp+0x80] + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m8 ; 67a + punpckhwd m7, m8 ; 67b + SWAP m14, m8 + mova m8, [rsp+0x140] + mova m9, [rsp+0x150] + mova m10, [rsp+0x160] + mova m11, [rsp+0x170] + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m15; 23a + punpckhwd m3, m15 ; 23b + mova [rsp+0x50], m4 + mova [rsp+0x60], m5 + mova [rsp+0x70], m6 + mova [rsp+0x80], m7 +%else + movd r0, m15 + movd rX, m4 + psrldq m15, 4 + psrldq m4, 4 + movd r4, m15 + movd r5, m4 + mova m14, [esp+0x110] + movq m2, [base+subpel_filters+r0*8] + movq m3, [base+subpel_filters+rX*8] + movhps m2, [base+subpel_filters+r4*8] + movhps m3, [base+subpel_filters+r5*8] + psrld m14, 10 + mova [esp+16], m14 + mov r0, [esp+ 0] + mov rX, [esp+ 8] + mov r4, [esp+ 4] + mov r5, [esp+12] + mova [esp+0x20], m0 + mova [esp+0x30], m1 + mova [esp+0x40], m2 + mova [esp+0x50], m3 + pshufd m4, m6, q1100 + pshufd m6, m6, q3322 + pshufd m7, m5, q1100 + pshufd m5, m5, q3322 + pand m0, m11, m4 + pand m1, m11, m6 + pand m2, m11, m7 + pand m3, m11, m5 + pandn m4, [esp+0x20] + pandn m6, [esp+0x30] + pandn m7, [esp+0x40] + pandn m5, [esp+0x50] + por m0, m4 + por m1, m6 + por m2, m7 + por m3, m5 + mova [esp+0x20], m0 + mova [esp+0x30], m1 + mova [esp+0x40], m2 + mova [esp+0x50], m3 + MC_8TAP_SCALED_H 0x20, 0x60, 0 ; 0-1 + MC_8TAP_SCALED_H 0x20, 0x180 ; 2-3 + MC_8TAP_SCALED_H 0x20, 0x1a0 ; 4-5 + MC_8TAP_SCALED_H 0x20, 0x1c0 ; 6-7 + mova m5, [esp+0x1a0] + mova m6, [esp+0x1b0] + mova m7, [esp+0x1c0] + mova m0, [esp+0x1d0] + punpcklwd m4, m5, m6 ; 45a + punpckhwd m5, m6 ; 45b + punpcklwd m6, m7, m0 ; 67a + punpckhwd m7, m0 ; 67b + mova [esp+0x1a0], m4 + mova [esp+0x1b0], m5 + mova [esp+0x1c0], m6 + mova [esp+0x1d0], m7 + mova m1, [esp+0x060] + mova m2, [esp+0x070] + mova m3, [esp+0x180] + mova m4, [esp+0x190] + punpcklwd m0, m1, m2 ; 01a + punpckhwd m1, m2 ; 01b + punpcklwd m2, m3, m4 ; 23a + punpckhwd m3, m4 ; 23b + mova [esp+0x180], m2 + mova [esp+0x190], m3 + %define m8 [esp+0x140] + %define m9 [esp+0x150] + %define m10 [esp+0x160] + %define m11 [esp+0x170] +%endif +.dy2_vloop: +%if ARCH_X86_32 + mov r0, r0m +%endif + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m8 + pmaddwd m6, m2, m9 + pmaddwd m7, m3, m9 + paddd m4, m13 + paddd m5, m13 + paddd m4, m6 + paddd m5, m7 +%if ARCH_X86_64 + pmaddwd m6, [rsp+0x50], m10 + pmaddwd m7, [rsp+0x60], m10 +%else + pmaddwd m6, [esp+0x1a0], 
m10 + pmaddwd m7, [esp+0x1b0], m10 +%endif + paddd m4, m6 + paddd m5, m7 +%if ARCH_X86_64 + pmaddwd m6, [rsp+0x70], m11 + pmaddwd m7, [rsp+0x80], m11 +%else + pmaddwd m6, [esp+0x1c0], m11 + pmaddwd m7, [esp+0x1d0], m11 +%endif + paddd m4, m6 + paddd m5, m7 + psrad m4, rndshift + psrad m5, rndshift + packssdw m4, m5 +%ifidn %1, put + packuswb m4, m4 + movq [dstq], m4 + add dstq, dsm +%else + mova [tmpq], m4 + add tmpq, tmp_stridem +%endif +%if ARCH_X86_32 + mov r0m, r0 +%endif + dec hd + jz .dy2_hloop_prep +%if ARCH_X86_64 + mova m8, [rsp+0x10] + mova m9, [rsp+0x20] + mova m10, [rsp+0x30] + mova m11, [rsp+0x40] + mova m0, m2 ; 01a + mova m1, m3 ; 01b + MC_8TAP_SCALED_H 2, 6, 3, 4, 5, 7, 14, 15, 8, 9, 10, 11 + mova m3, [rsp+0x50] ; 23a + mova m4, [rsp+0x60] ; 23b + mova m5, [rsp+0x70] ; 45a + mova m7, [rsp+0x80] ; 45b + mova m8, [rsp+0x140] + mova m9, [rsp+0x150] + mova m10, [rsp+0x160] + mova m11, [rsp+0x170] + punpcklwd m14, m2, m6 ; 67a + punpckhwd m2, m6 ; 67b + mova [rsp+0x50], m5 + mova [rsp+0x60], m7 + mova [rsp+0x70], m14 + mova [rsp+0x80], m2 + mova m2, m3 + mova m3, m4 +%else + MC_8TAP_SCALED_H 0x20, 0 + punpcklwd m6, m0, m4 + punpckhwd m7, m0, m4 + mova m0, [esp+0x180] ; 01a + mova m1, [esp+0x190] ; 01b + mova m2, [rsp+0x1a0] ; 23a + mova m3, [esp+0x1b0] ; 23b + mova m4, [esp+0x1c0] ; 45a + mova m5, [esp+0x1d0] ; 45b + mova [esp+0x180], m2 + mova [esp+0x190], m3 + mova [esp+0x1a0], m4 + mova [esp+0x1b0], m5 + mova [esp+0x1c0], m6 ; 67a + mova [esp+0x1d0], m7 ; 67b +%endif + jmp .dy2_vloop +.ret: + MC_8TAP_SCALED_RET 0 +%if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT + %define r0m [rstk+stack_offset+ 4] + %define r1m [rstk+stack_offset+ 8] + %define r2m [rstk+stack_offset+12] + %define r3m [rstk+stack_offset+16] +%endif +%undef isprep +%endmacro + +%macro BILIN_SCALED_FN 1 +cglobal %1_bilin_scaled_8bpc + mov t0d, (5*15 << 16) | 5*15 + mov t1d, (5*15 << 16) | 5*15 + jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX) +%endmacro + +%if WIN64 +DECLARE_REG_TMP 6, 5 +%elif ARCH_X86_64 +DECLARE_REG_TMP 6, 8 +%else +DECLARE_REG_TMP 1, 2 +%endif +BILIN_SCALED_FN put +FN put_8tap_scaled, sharp, SHARP, SHARP +FN put_8tap_scaled, sharp_smooth, SHARP, SMOOTH +FN put_8tap_scaled, smooth_sharp, SMOOTH, SHARP +FN put_8tap_scaled, smooth, SMOOTH, SMOOTH +FN put_8tap_scaled, sharp_regular, SHARP, REGULAR +FN put_8tap_scaled, regular_sharp, REGULAR, SHARP +FN put_8tap_scaled, smooth_regular, SMOOTH, REGULAR +FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH +FN put_8tap_scaled, regular, REGULAR, REGULAR +MC_8TAP_SCALED put + +%if WIN64 +DECLARE_REG_TMP 5, 4 +%elif ARCH_X86_64 +DECLARE_REG_TMP 6, 7 +%else +DECLARE_REG_TMP 1, 2 +%endif +BILIN_SCALED_FN prep +FN prep_8tap_scaled, sharp, SHARP, SHARP +FN prep_8tap_scaled, sharp_smooth, SHARP, SMOOTH +FN prep_8tap_scaled, smooth_sharp, SMOOTH, SHARP +FN prep_8tap_scaled, smooth, SMOOTH, SMOOTH +FN prep_8tap_scaled, sharp_regular, SHARP, REGULAR +FN prep_8tap_scaled, regular_sharp, REGULAR, SHARP +FN prep_8tap_scaled, smooth_regular, SMOOTH, REGULAR +FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH +FN prep_8tap_scaled, regular, REGULAR, REGULAR +MC_8TAP_SCALED prep + +%if ARCH_X86_32 + %macro SAVE_ALPHA_BETA 0 + mov alpham, alphad + mov betam, betad + %endmacro + + %macro SAVE_DELTA_GAMMA 0 + mov deltam, deltad + mov gammam, gammad + %endmacro + + %macro LOAD_ALPHA_BETA_MX 0 + mov mym, myd + mov alphad, alpham + mov betad, betam + mov mxd, mxm + %endmacro + + %macro LOAD_DELTA_GAMMA_MY 0 + mov mxm, mxd + mov 
deltad, deltam + mov gammad, gammam + mov myd, mym + %endmacro + + %define PIC_reg r2 + %define PIC_base_offset $$ + %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset) +%else + %define SAVE_ALPHA_BETA + %define SAVE_DELTA_GAMMA + %define PIC_sym(sym) sym +%endif + +%if ARCH_X86_32 + %if STACK_ALIGNMENT < required_stack_alignment + %assign copy_args 8*4 + %else + %assign copy_args 0 + %endif +%endif + +%macro RELOC_ARGS 0 + %if copy_args + mov r0, r0m + mov r1, r1m + mov r2, r2m + mov r3, r3m + mov r5, r5m + mov dstm, r0 + mov dsm, r1 + mov srcm, r2 + mov ssm, r3 + mov mxm, r5 + mov r0, r6m + mov mym, r0 + %endif +%endmacro + +%macro BLENDHWDW 2 ; blend high words from dwords, src1, src2 + %if cpuflag(sse4) + pblendw %1, %2, 0xAA + %else + pand %2, m10 + por %1, %2 + %endif +%endmacro + +%macro WARP_V 10 ; dst0, dst1, 0, 2, 4, 6, 1, 3, 5, 7 + %if ARCH_X86_32 + %define m8 m4 + %define m9 m5 + %define m14 m6 + %define m15 m7 + %define m11 m7 + %endif + %if notcpuflag(ssse3) || ARCH_X86_32 + pxor m11, m11 + %endif + lea tmp1d, [myq+deltaq*4] + lea tmp2d, [myq+deltaq*1] + shr myd, 10 + shr tmp1d, 10 + movq m2, [filterq+myq *8] ; a + movq m8, [filterq+tmp1q*8] ; e + lea tmp1d, [tmp2q+deltaq*4] + lea myd, [tmp2q+deltaq*1] + shr tmp2d, 10 + shr tmp1d, 10 + movq m3, [filterq+tmp2q*8] ; b + movq m0, [filterq+tmp1q*8] ; f + punpcklwd m2, m3 + punpcklwd m8, m0 + lea tmp1d, [myq+deltaq*4] + lea tmp2d, [myq+deltaq*1] + shr myd, 10 + shr tmp1d, 10 + movq m0, [filterq+myq *8] ; c + movq m9, [filterq+tmp1q*8] ; g + lea tmp1d, [tmp2q+deltaq*4] + lea myd, [tmp2q+gammaq] ; my += gamma + shr tmp2d, 10 + shr tmp1d, 10 + movq m3, [filterq+tmp2q*8] ; d + movq m1, [filterq+tmp1q*8] ; h + punpcklwd m0, m3 + punpcklwd m9, m1 + punpckldq m1, m2, m0 + punpckhdq m2, m0 + punpcklbw m0, m11, m1 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8 + punpckhbw m3, m11, m1 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8 + punpcklbw m1, m11, m2 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8 + punpckhbw m14, m11, m2 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8 + pmaddwd m0, %3 + pmaddwd m3, %5 + pmaddwd m1, %7 + pmaddwd m14, %9 + paddd m0, m3 + paddd m1, m14 + paddd m0, m1 + mova %1, m0 + %if ARCH_X86_64 + SWAP m3, m14 + %endif + punpckldq m0, m8, m9 + punpckhdq m8, m9 + punpcklbw m1, m11, m0 ; e0 e2 f0 f2 g0 g2 h0 h2 << 8 + punpckhbw m14, m11, m0 ; e4 e6 f4 f6 g4 g6 h4 h6 << 8 + punpcklbw m2, m11, m8 ; e1 e3 f1 f3 g1 g3 h1 h3 << 8 + punpckhbw m15, m11, m8 ; e5 e7 f5 f7 g5 g7 h5 h7 << 8 + pmaddwd m1, %4 + pmaddwd m14, %6 + pmaddwd m2, %8 + pmaddwd m15, %10 + paddd m1, m14 + paddd m2, m15 + paddd m1, m2 + mova %2, m1 + %if ARCH_X86_64 + SWAP m14, m3 + %endif +%endmacro + +%if ARCH_X86_64 + %define counterd r4d +%else + %if copy_args == 0 + %define counterd dword r4m + %else + %define counterd dword [esp+stack_size-4*7] + %endif +%endif + +%macro WARP_AFFINE_8X8T 0 +%if ARCH_X86_64 +cglobal warp_affine_8x8t_8bpc, 6, 14, 16, 0x90, tmp, ts +%else +cglobal warp_affine_8x8t_8bpc, 0, 7, 16, -0x130-copy_args, tmp, ts + %if copy_args + %define tmpm [esp+stack_size-4*1] + %define tsm [esp+stack_size-4*2] + %endif +%endif + call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main +.loop: +%if ARCH_X86_32 + %define m12 m4 + %define m13 m5 + %define m14 m6 + %define m15 m7 + mova m12, [esp+0xC0] + mova m13, [esp+0xD0] + mova m14, [esp+0xE0] + mova m15, [esp+0xF0] +%endif +%if cpuflag(ssse3) + psrad m12, 13 + psrad m13, 13 + psrad m14, 13 + psrad m15, 13 + packssdw m12, m13 + packssdw m14, m15 + mova m13, [PIC_sym(pw_8192)] + pmulhrsw m12, m13 ; (x + (1 << 6)) >> 7 + pmulhrsw m14, m13 +%else + 
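+; The two rounding paths produce identical results: both reduce the 32-bit
+; warp intermediates by (x + (1 << 14)) >> 15. The ssse3 branch above
+; pre-shifts by 13 and lets pmulhrsw with pw_8192 add the rounding bias,
+; since ((x >> 13) + 2) >> 2 == (x + (1 << 14)) >> 15; the plain-sse2
+; branch below adds pd_16384 explicitly and then shifts right by 15.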
%if ARCH_X86_32 + %define m10 m0 + %endif + mova m10, [PIC_sym(pd_16384)] + paddd m12, m10 + paddd m13, m10 + paddd m14, m10 + paddd m15, m10 + psrad m12, 15 + psrad m13, 15 + psrad m14, 15 + psrad m15, 15 + packssdw m12, m13 + packssdw m14, m15 +%endif + mova [tmpq+tsq*0], m12 + mova [tmpq+tsq*2], m14 + dec counterd + jz mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).end +%if ARCH_X86_32 + mov tmpm, tmpd + mov r0, [esp+0x100] + mov r1, [esp+0x104] +%endif + call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main2 + lea tmpq, [tmpq+tsq*4] + jmp .loop +%endmacro + +%macro WARP_AFFINE_8X8 0 +%if ARCH_X86_64 +cglobal warp_affine_8x8_8bpc, 6, 14, 16, 0x90, \ + dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \ + filter, tmp1, delta, my, gamma +%else +cglobal warp_affine_8x8_8bpc, 0, 7, 16, -0x130-copy_args, \ + dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \ + filter, tmp1, delta, my, gamma + %define alphaq r0 + %define alphad r0 + %define alpham [esp+gprsize+0x100] + %define betaq r1 + %define betad r1 + %define betam [esp+gprsize+0x104] + %define deltaq r0 + %define deltad r0 + %define deltam [esp+gprsize+0x108] + %define gammaq r1 + %define gammad r1 + %define gammam [esp+gprsize+0x10C] + %define filterq r3 + %define tmp1q r4 + %define tmp1d r4 + %define tmp1m [esp+gprsize+0x110] + %define myq r5 + %define myd r5 + %define mym r6m + %if copy_args + %define dstm [esp+stack_size-4*1] + %define dsm [esp+stack_size-4*2] + %define srcm [esp+stack_size-4*3] + %define ssm [esp+stack_size-4*4] + %define mxm [esp+stack_size-4*5] + %define mym [esp+stack_size-4*6] + %endif +%endif + call .main + jmp .start +.loop: +%if ARCH_X86_32 + mov dstm, dstd + mov alphad, [esp+0x100] + mov betad, [esp+0x104] +%endif + call .main2 + lea dstq, [dstq+dsq*2] +.start: +%if notcpuflag(sse4) + %if cpuflag(ssse3) + %define roundval pw_8192 + %else + %define roundval pd_262144 + %endif + %if ARCH_X86_64 + mova m10, [PIC_sym(roundval)] + %else + %define m10 [PIC_sym(roundval)] + %endif +%endif +%if ARCH_X86_32 + %define m12 m5 + %define m13 m6 + mova m12, [esp+0xC0] + mova m13, [esp+0xD0] +%endif +%if cpuflag(sse4) + %if ARCH_X86_32 + %define m11 m4 + pxor m11, m11 + %endif + psrad m12, 18 + psrad m13, 18 + packusdw m12, m13 + pavgw m12, m11 ; (x + (1 << 10)) >> 11 +%else + %if cpuflag(ssse3) + psrad m12, 17 + psrad m13, 17 + packssdw m12, m13 + pmulhrsw m12, m10 + %else + paddd m12, m10 + paddd m13, m10 + psrad m12, 19 + psrad m13, 19 + packssdw m12, m13 + %endif +%endif +%if ARCH_X86_32 + %define m14 m6 + %define m15 m7 + mova m14, [esp+0xE0] + mova m15, [esp+0xF0] +%endif +%if cpuflag(sse4) + psrad m14, 18 + psrad m15, 18 + packusdw m14, m15 + pavgw m14, m11 ; (x + (1 << 10)) >> 11 +%else + %if cpuflag(ssse3) + psrad m14, 17 + psrad m15, 17 + packssdw m14, m15 + pmulhrsw m14, m10 + %else + paddd m14, m10 + paddd m15, m10 + psrad m14, 19 + psrad m15, 19 + packssdw m14, m15 + %endif +%endif + packuswb m12, m14 + movq [dstq+dsq*0], m12 + movhps [dstq+dsq*1], m12 + dec counterd + jg .loop +.end: + RET +ALIGN function_align +.main: +%assign stack_offset stack_offset+gprsize +%if ARCH_X86_32 + %assign stack_size stack_size+4 + %if copy_args + %assign stack_offset stack_offset-4 + %endif + RELOC_ARGS + LEA PIC_reg, $$ + %define PIC_mem [esp+gprsize+0x114] + mov abcdd, abcdm + %if copy_args == 0 + mov ssd, ssm + mov mxd, mxm + %endif + mov PIC_mem, PIC_reg + mov srcd, srcm +%endif + movsx deltad, word [abcdq+2*2] + movsx gammad, word [abcdq+2*3] + lea tmp1d, [deltaq*3] + sub gammad, tmp1d ; gamma -= 
delta*3 + SAVE_DELTA_GAMMA +%if ARCH_X86_32 + mov abcdd, abcdm +%endif + movsx alphad, word [abcdq+2*0] + movsx betad, word [abcdq+2*1] + lea tmp1q, [ssq*3+3] + add mxd, 512+(64<<10) + lea tmp2d, [alphaq*3] + sub srcq, tmp1q ; src -= src_stride*3 + 3 +%if ARCH_X86_32 + mov srcm, srcd + mov PIC_reg, PIC_mem +%endif + sub betad, tmp2d ; beta -= alpha*3 + lea filterq, [PIC_sym(mc_warp_filter2)] +%if ARCH_X86_64 + mov myd, r6m + %if cpuflag(ssse3) + pxor m11, m11 + %endif +%endif + call .h + psrld m2, m0, 16 + psrld m3, m1, 16 +%if ARCH_X86_32 + %if notcpuflag(ssse3) + mova [esp+gprsize+0x00], m2 + %endif + mova [esp+gprsize+0x10], m3 +%endif + call .h + psrld m4, m0, 16 + psrld m5, m1, 16 +%if ARCH_X86_32 + mova [esp+gprsize+0x20], m4 + mova [esp+gprsize+0x30], m5 +%endif + call .h +%if ARCH_X86_64 + %define blendmask [rsp+gprsize+0x80] +%else + %if notcpuflag(ssse3) + mova m2, [esp+gprsize+0x00] + %endif + mova m3, [esp+gprsize+0x10] + %define blendmask [esp+gprsize+0x120] + %define m10 m7 +%endif + pcmpeqd m10, m10 + pslld m10, 16 + mova blendmask, m10 + BLENDHWDW m2, m0 ; 0 + BLENDHWDW m3, m1 ; 2 + mova [rsp+gprsize+0x00], m2 + mova [rsp+gprsize+0x10], m3 + call .h +%if ARCH_X86_32 + mova m4, [esp+gprsize+0x20] + mova m5, [esp+gprsize+0x30] +%endif + mova m10, blendmask + BLENDHWDW m4, m0 ; 1 + BLENDHWDW m5, m1 ; 3 + mova [rsp+gprsize+0x20], m4 + mova [rsp+gprsize+0x30], m5 + call .h +%if ARCH_X86_32 + %if notcpuflag(ssse3) + mova m2, [esp+gprsize+0x00] + %endif + mova m3, [esp+gprsize+0x10] + %define m10 m5 +%endif + psrld m6, m2, 16 + psrld m7, m3, 16 + mova m10, blendmask + BLENDHWDW m6, m0 ; 2 + BLENDHWDW m7, m1 ; 4 + mova [rsp+gprsize+0x40], m6 + mova [rsp+gprsize+0x50], m7 + call .h +%if ARCH_X86_32 + mova m4, [esp+gprsize+0x20] + mova m5, [esp+gprsize+0x30] +%endif + psrld m2, m4, 16 + psrld m3, m5, 16 + mova m10, blendmask + BLENDHWDW m2, m0 ; 3 + BLENDHWDW m3, m1 ; 5 + mova [rsp+gprsize+0x60], m2 + mova [rsp+gprsize+0x70], m3 + call .h +%if ARCH_X86_32 + mova m6, [esp+gprsize+0x40] + mova m7, [esp+gprsize+0x50] + %define m10 m7 +%endif + psrld m4, m6, 16 + psrld m5, m7, 16 + mova m10, blendmask + BLENDHWDW m4, m0 ; 4 + BLENDHWDW m5, m1 ; 6 +%if ARCH_X86_64 + add myd, 512+(64<<10) + mova m6, m2 + mova m7, m3 +%else + mova [esp+gprsize+0x80], m4 + mova [esp+gprsize+0x90], m5 + add dword mym, 512+(64<<10) +%endif + mov counterd, 4 + SAVE_ALPHA_BETA +.main2: + call .h +%if ARCH_X86_32 + mova m6, [esp+gprsize+0x60] + mova m7, [esp+gprsize+0x70] + %define m10 m5 +%endif + psrld m6, 16 + psrld m7, 16 + mova m10, blendmask + BLENDHWDW m6, m0 ; 5 + BLENDHWDW m7, m1 ; 7 +%if ARCH_X86_64 + WARP_V m12, m13, [rsp+gprsize+0x00], [rsp+gprsize+0x10], \ + m4, m5, \ + [rsp+gprsize+0x20], [rsp+gprsize+0x30], \ + m6, m7 +%else + mova [esp+gprsize+0xA0], m6 + mova [esp+gprsize+0xB0], m7 + LOAD_DELTA_GAMMA_MY + WARP_V [esp+gprsize+0xC0], [esp+gprsize+0xD0], \ + [esp+gprsize+0x00], [esp+gprsize+0x10], \ + [esp+gprsize+0x80], [esp+gprsize+0x90], \ + [esp+gprsize+0x20], [esp+gprsize+0x30], \ + [esp+gprsize+0xA0], [esp+gprsize+0xB0] + LOAD_ALPHA_BETA_MX +%endif + call .h + mova m2, [rsp+gprsize+0x40] + mova m3, [rsp+gprsize+0x50] +%if ARCH_X86_32 + mova m4, [rsp+gprsize+0x80] + mova m5, [rsp+gprsize+0x90] + %define m10 m7 +%endif + mova [rsp+gprsize+0x00], m2 + mova [rsp+gprsize+0x10], m3 + mova [rsp+gprsize+0x40], m4 + mova [rsp+gprsize+0x50], m5 + psrld m4, 16 + psrld m5, 16 + mova m10, blendmask + BLENDHWDW m4, m0 ; 6 + BLENDHWDW m5, m1 ; 8 +%if ARCH_X86_64 + WARP_V m14, m15, [rsp+gprsize+0x20], 
[rsp+gprsize+0x30], \ + m6, m7, \ + [rsp+gprsize+0x00], [rsp+gprsize+0x10], \ + m4, m5 +%else + mova [esp+gprsize+0x80], m4 + mova [esp+gprsize+0x90], m5 + LOAD_DELTA_GAMMA_MY + WARP_V [esp+gprsize+0xE0], [esp+gprsize+0xF0], \ + [esp+gprsize+0x20], [esp+gprsize+0x30], \ + [esp+gprsize+0xA0], [esp+gprsize+0xB0], \ + [esp+gprsize+0x00], [esp+gprsize+0x10], \ + [esp+gprsize+0x80], [esp+gprsize+0x90] + mov mym, myd + mov dstd, dstm + mov dsd, dsm + mov mxd, mxm +%endif + mova m2, [rsp+gprsize+0x60] + mova m3, [rsp+gprsize+0x70] +%if ARCH_X86_32 + mova m6, [esp+gprsize+0xA0] + mova m7, [esp+gprsize+0xB0] +%endif + mova [rsp+gprsize+0x20], m2 + mova [rsp+gprsize+0x30], m3 + mova [rsp+gprsize+0x60], m6 + mova [rsp+gprsize+0x70], m7 + ret +ALIGN function_align +.h: +%if ARCH_X86_32 + %define m8 m3 + %define m9 m4 + %define m10 m5 + %define m14 m6 + %define m15 m7 +%endif + lea tmp1d, [mxq+alphaq*4] + lea tmp2d, [mxq+alphaq*1] +%if ARCH_X86_32 + %assign stack_offset stack_offset+4 + %assign stack_size stack_size+4 + %define PIC_mem [esp+gprsize*2+0x114] + mov PIC_mem, PIC_reg + mov srcd, srcm +%endif + movu m10, [srcq] +%if ARCH_X86_32 + add srcd, ssm + mov srcm, srcd + mov PIC_reg, PIC_mem +%else + add srcq, ssq +%endif + shr mxd, 10 + shr tmp1d, 10 + movq m1, [filterq+mxq *8] ; 0 X + movq m8, [filterq+tmp1q*8] ; 4 X + lea tmp1d, [tmp2q+alphaq*4] + lea mxd, [tmp2q+alphaq*1] + shr tmp2d, 10 + shr tmp1d, 10 + movhps m1, [filterq+tmp2q*8] ; 0 1 + movhps m8, [filterq+tmp1q*8] ; 4 5 + lea tmp1d, [mxq+alphaq*4] + lea tmp2d, [mxq+alphaq*1] + shr mxd, 10 + shr tmp1d, 10 +%if cpuflag(ssse3) + movq m14, [filterq+mxq *8] ; 2 X + movq m9, [filterq+tmp1q*8] ; 6 X + lea tmp1d, [tmp2q+alphaq*4] + lea mxd, [tmp2q+betaq] ; mx += beta + shr tmp2d, 10 + shr tmp1d, 10 + movhps m14, [filterq+tmp2q*8] ; 2 3 + movhps m9, [filterq+tmp1q*8] ; 6 7 + pshufb m0, m10, [PIC_sym(warp_8x8_shufA)] + pmaddubsw m0, m1 + pshufb m1, m10, [PIC_sym(warp_8x8_shufB)] + pmaddubsw m1, m8 + pshufb m15, m10, [PIC_sym(warp_8x8_shufC)] + pmaddubsw m15, m14 + pshufb m10, m10, [PIC_sym(warp_8x8_shufD)] + pmaddubsw m10, m9 + phaddw m0, m15 + phaddw m1, m10 +%else + %if ARCH_X86_32 + %define m11 m2 + %endif + pcmpeqw m0, m0 + psrlw m14, m0, 8 + psrlw m15, m10, 8 ; 01 03 05 07 09 11 13 15 + pand m14, m10 ; 00 02 04 06 08 10 12 14 + packuswb m14, m15 ; 00 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15 + psrldq m9, m0, 4 + pshufd m0, m14, q0220 + pand m0, m9 + psrldq m14, 1 ; 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __ + pslldq m15, m14, 12 + por m0, m15 ; shufA + psrlw m15, m0, 8 + psraw m11, m1, 8 + psllw m0, 8 + psllw m1, 8 + psrlw m0, 8 + psraw m1, 8 + pmullw m15, m11 + pmullw m0, m1 + paddw m0, m15 ; pmaddubsw m0, m1 + pshufd m15, m14, q0220 + pand m15, m9 + psrldq m14, 1 ; 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __ + pslldq m1, m14, 12 + por m15, m1 ; shufC + pshufd m1, m14, q0220 + pand m1, m9 + psrldq m14, 1 ; 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __ + pslldq m11, m14, 12 + por m1, m11 ; shufB + pshufd m10, m14, q0220 + pand m10, m9 + psrldq m14, 1 ; 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __ __ + pslldq m14, m14, 12 + por m10, m14 ; shufD + psrlw m9, m1, 8 + psraw m11, m8, 8 + psllw m1, 8 + psllw m8, 8 + psrlw m1, 8 + psraw m8, 8 + pmullw m9, m11 + pmullw m1, m8 + paddw m1, m9 ; pmaddubsw m1, m8 + movq m14, [filterq+mxq *8] ; 2 X + movq m9, [filterq+tmp1q*8] ; 6 X + lea tmp1d, [tmp2q+alphaq*4] + lea mxd, [tmp2q+betaq] ; mx += beta + shr tmp2d, 10 + shr tmp1d, 10 + movhps m14, [filterq+tmp2q*8] ; 2 3 + movhps m9, 
[filterq+tmp1q*8] ; 6 7 + psrlw m8, m15, 8 + psraw m11, m14, 8 + psllw m15, 8 + psllw m14, 8 + psrlw m15, 8 + psraw m14, 8 + pmullw m8, m11 + pmullw m15, m14 + paddw m15, m8 ; pmaddubsw m15, m14 + psrlw m8, m10, 8 + psraw m11, m9, 8 + psllw m10, 8 + psllw m9, 8 + psrlw m10, 8 + psraw m9, 8 + pmullw m8, m11 + pmullw m10, m9 + paddw m10, m8 ; pmaddubsw m10, m9 + pslld m8, m0, 16 + pslld m9, m1, 16 + pslld m14, m15, 16 + pslld m11, m10, 16 + paddw m0, m8 + paddw m1, m9 + paddw m15, m14 + paddw m10, m11 + psrad m0, 16 + psrad m1, 16 + psrad m15, 16 + psrad m10, 16 + packssdw m0, m15 ; phaddw m0, m15 + packssdw m1, m10 ; phaddw m1, m10 +%endif + mova m14, [PIC_sym(pw_8192)] + mova m9, [PIC_sym(pd_32768)] + pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13 + pmaddwd m1, m14 + paddd m0, m9 ; rounded 14-bit result in upper 16 bits of dword + paddd m1, m9 + ret +%endmacro + +%if WIN64 +DECLARE_REG_TMP 6, 4 +%else +DECLARE_REG_TMP 6, 7 +%endif + +%macro BIDIR_FN 1 ; op + %1 0 + lea stride3q, [strideq*3] + jmp wq +.w4_loop: + %1_INC_PTR 2 + %1 0 + lea dstq, [dstq+strideq*4] +.w4: ; tile 4x + movd [dstq ], m0 ; copy dw[0] + pshuflw m1, m0, q1032 ; swap dw[1] and dw[0] + movd [dstq+strideq*1], m1 ; copy dw[1] + punpckhqdq m0, m0 ; swap dw[3,2] with dw[1,0] + movd [dstq+strideq*2], m0 ; dw[2] + psrlq m0, 32 ; shift right in dw[3] + movd [dstq+stride3q ], m0 ; copy + sub hd, 4 + jg .w4_loop + RET +.w8_loop: + %1_INC_PTR 2 + %1 0 + lea dstq, [dstq+strideq*2] +.w8: + movq [dstq ], m0 + movhps [dstq+strideq*1], m0 + sub hd, 2 + jg .w8_loop + RET +.w16_loop: + %1_INC_PTR 2 + %1 0 + lea dstq, [dstq+strideq] +.w16: + mova [dstq ], m0 + dec hd + jg .w16_loop + RET +.w32_loop: + %1_INC_PTR 4 + %1 0 + lea dstq, [dstq+strideq] +.w32: + mova [dstq ], m0 + %1 2 + mova [dstq + 16 ], m0 + dec hd + jg .w32_loop + RET +.w64_loop: + %1_INC_PTR 8 + %1 0 + add dstq, strideq +.w64: + %assign i 0 + %rep 4 + mova [dstq + i*16 ], m0 + %assign i i+1 + %if i < 4 + %1 2*i + %endif + %endrep + dec hd + jg .w64_loop + RET +.w128_loop: + %1_INC_PTR 16 + %1 0 + add dstq, strideq +.w128: + %assign i 0 + %rep 8 + mova [dstq + i*16 ], m0 + %assign i i+1 + %if i < 8 + %1 2*i + %endif + %endrep + dec hd + jg .w128_loop + RET +%endmacro + +%macro AVG 1 ; src_offset + ; writes AVG of tmp1 tmp2 uint16 coeffs into uint8 pixel + mova m0, [tmp1q+(%1+0)*mmsize] ; load 8 coef(2bytes) from tmp1 + paddw m0, [tmp2q+(%1+0)*mmsize] ; load/add 8 coef(2bytes) tmp2 + mova m1, [tmp1q+(%1+1)*mmsize] + paddw m1, [tmp2q+(%1+1)*mmsize] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + packuswb m0, m1 ; pack/trunc 16 bits from m0 & m1 to 8 bit +%endmacro + +%macro AVG_INC_PTR 1 + add tmp1q, %1*mmsize + add tmp2q, %1*mmsize +%endmacro + +cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 + LEA r6, avg_ssse3_table + tzcnt wd, wm ; leading zeros + movifnidn hd, hm ; move h(stack) to h(register) if not already that register + movsxd wq, dword [r6+wq*4] ; push table entry matching the tile width (tzcnt) in widen reg + mova m2, [pw_1024+r6-avg_ssse3_table] ; fill m2 with shift/align + add wq, r6 + BIDIR_FN AVG + +%macro W_AVG 1 ; src_offset + ; (a * weight + b * (16 - weight) + 128) >> 8 + ; = ((a - b) * weight + (b << 4) + 128) >> 8 + ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4 + ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4 + mova m2, [tmp1q+(%1+0)*mmsize] + mova m0, m2 + psubw m2, [tmp2q+(%1+0)*mmsize] + mova m3, [tmp1q+(%1+1)*mmsize] + mova m1, m3 + psubw m3, [tmp2q+(%1+1)*mmsize] + pmulhw m2, m4 + pmulhw m3, m4 + paddw m0, m2 + 
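+; At this point m0 holds a + (((a - b) * ((weight-16) << 12)) >> 16) per
+; the third identity above (tmp1/tmp2 and the weight sign are pre-swapped
+; for weight <= 7, matching the fourth identity); the paddw below repeats
+; this for the second vector, and the pmulhrsw by pw_2048 supplies the
+; final (x + 8) >> 4. Scalar model of the macro per coefficient pair
+; (illustrative only; the helper name is hypothetical):
+;     int w_avg_px(int a, int b, int weight) { /* int16 coeffs, w 0..16 */
+;         return (a * weight + b * (16 - weight) + 128) >> 8;
+;     }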
paddw m1, m3 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + packuswb m0, m1 +%endmacro + +%define W_AVG_INC_PTR AVG_INC_PTR + +cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 + LEA r6, w_avg_ssse3_table + tzcnt wd, wm + movd m4, r6m + movifnidn hd, hm + pxor m0, m0 + movsxd wq, dword [r6+wq*4] + mova m5, [pw_2048+r6-w_avg_ssse3_table] + pshufb m4, m0 + psllw m4, 12 ; (weight-16) << 12 when interpreted as signed + add wq, r6 + cmp dword r6m, 7 + jg .weight_gt7 + mov r6, tmp1q + psubw m0, m4 + mov tmp1q, tmp2q + mova m4, m0 ; -weight + mov tmp2q, r6 +.weight_gt7: + BIDIR_FN W_AVG + +%macro MASK 1 ; src_offset + ; (a * m + b * (64 - m) + 512) >> 10 + ; = ((a - b) * m + (b << 6) + 512) >> 10 + ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4 + mova m3, [maskq+(%1+0)*(mmsize/2)] + mova m0, [tmp2q+(%1+0)*mmsize] ; b + psubw m1, m0, [tmp1q+(%1+0)*mmsize] ; b - a + mova m6, m3 ; m + psubb m3, m4, m6 ; -m + paddw m1, m1 ; (b - a) << 1 + paddb m3, m3 ; -m << 1 + punpcklbw m2, m4, m3 ; -m << 9 (<< 8 when ext as uint16) + pmulhw m1, m2 ; (-m * (b - a)) << 10 + paddw m0, m1 ; + b + mova m1, [tmp2q+(%1+1)*mmsize] ; b + psubw m2, m1, [tmp1q+(%1+1)*mmsize] ; b - a + paddw m2, m2 ; (b - a) << 1 + mova m6, m3 ; (-m << 1) + punpckhbw m3, m4, m6 ; (-m << 9) + pmulhw m2, m3 ; (-m << 9) + paddw m1, m2 ; (-m * (b - a)) << 10 + pmulhrsw m0, m5 ; round + pmulhrsw m1, m5 ; round + packuswb m0, m1 ; interleave 16 -> 8 +%endmacro + +%macro MASK_INC_PTR 1 + add maskq, %1*mmsize/2 + add tmp1q, %1*mmsize + add tmp2q, %1*mmsize +%endmacro + +%if ARCH_X86_64 +cglobal mask_8bpc, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3 + movifnidn hd, hm +%else +cglobal mask_8bpc, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3 +%define hd dword r5m +%endif +%define base r6-mask_ssse3_table + LEA r6, mask_ssse3_table + tzcnt wd, wm + movsxd wq, dword [r6+wq*4] + pxor m4, m4 + mova m5, [base+pw_2048] + add wq, r6 + mov maskq, r6m + BIDIR_FN MASK +%undef hd + +%macro W_MASK_420_END 1-* +%rep %0 + call .main + paddw m2, [maskq+16*%1] + mova [maskq+16*%1], m2 + mova [dstq+strideq*1+16*(2*%1+0)], m0 + call .main + psubw m3, m7, m2 + psubw m1, m7, [maskq+16*%1] + psubw m3, [dstq+strideq*1+16*(2*%1+1)] + psrlw m1, 2 + psrlw m3, 2 + packuswb m1, m3 + mova [maskq+16*%1], m1 + mova [dstq+strideq*1+16*(2*%1+1)], m0 + %rotate 1 +%endrep +%endmacro + +%if UNIX64 +DECLARE_REG_TMP 7 +%else +DECLARE_REG_TMP 5 +%endif + +cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask +%define base t0-w_mask_420_ssse3_table + LEA t0, w_mask_420_ssse3_table + tzcnt wd, wm + mov r6d, r7m ; sign + sub tmp2q, tmp1q + movsxd wq, [t0+wq*4] + mova m6, [base+pw_2048] + movddup m7, [base+wm_420_sign+r6*8] ; 258 - sign + add wq, t0 +%if ARCH_X86_64 + mova m8, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 + movifnidn hd, hm +%else + %define m8 [base+pw_6903] + %define hd dword hm +%endif + mov maskq, maskmp + call .main + jmp wq +.w4_loop: + call .main + add maskq, 4 + lea dstq, [dstq+strideq*2] +.w4: + pshufd m3, m2, q2020 + pshufd m2, m2, q3131 + psubw m1, m7, m3 + psubw m1, m2 + psrlw m1, 2 + packuswb m1, m1 + movd [maskq], m1 + movd [dstq+strideq*0], m0 + pshuflw m1, m0, q1032 + movd [dstq+strideq*1], m1 + punpckhqdq m0, m0 + lea dstq, [dstq+strideq*2] + movd [dstq+strideq*0], m0 + pshuflw m1, m0, q1032 + movd [dstq+strideq*1], m1 + sub hd, 4 + jg .w4_loop + RET +.w8_loop: + call .main + add maskq, 4 + lea dstq, [dstq+strideq*2] +.w8: + movhlps m3, m2 + psubw m1, m7, m2 + psubw m1, m3 + psrlw m1, 2 + packuswb m1, m1 + movd [maskq], 
m1 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + sub hd, 2 + jg .w8_loop + RET +.w16_loop: + call .main + add maskq, 8 + lea dstq, [dstq+strideq*2] +.w16: + mova [dstq+strideq*1], m2 + mova [dstq+strideq*0], m0 + call .main + psubw m1, m7, [dstq+strideq*1] + psubw m1, m2 + psrlw m1, 2 + packuswb m1, m1 + movq [maskq], m1 + mova [dstq+strideq*1], m0 + sub hd, 2 + jg .w16_loop + RET +.w32_loop: + call .main + add maskq, 16 + lea dstq, [dstq+strideq*2] +.w32: + mova [maskq], m2 + mova [dstq+strideq*0+16*0], m0 + call .main + mova [dstq+strideq*1+16*1], m2 + mova [dstq+strideq*0+16*1], m0 + W_MASK_420_END 0 + sub hd, 2 + jg .w32_loop + RET +.w64_loop: + call .main + add maskq, 16*2 + lea dstq, [dstq+strideq*2] +.w64: + mova [maskq+16*0], m2 + mova [dstq+strideq*0+16*0], m0 + call .main + mova [dstq+strideq*1+16*1], m2 + mova [dstq+strideq*0+16*1], m0 + call .main + mova [maskq+16*1], m2 + mova [dstq+strideq*0+16*2], m0 + call .main + mova [dstq+strideq*1+16*3], m2 + mova [dstq+strideq*0+16*3], m0 + W_MASK_420_END 0, 1 + sub hd, 2 + jg .w64_loop + RET +.w128_loop: + call .main + add maskq, 16*4 + lea dstq, [dstq+strideq*2] +.w128: + mova [maskq+16*0], m2 + mova [dstq+strideq*0+16*0], m0 + call .main + mova [dstq+strideq*1+16*1], m2 + mova [dstq+strideq*0+16*1], m0 + call .main + mova [maskq+16*1], m2 + mova [dstq+strideq*0+16*2], m0 + call .main + mova [dstq+strideq*1+16*3], m2 + mova [dstq+strideq*0+16*3], m0 + call .main + mova [maskq+16*2], m2 + mova [dstq+strideq*0+16*4], m0 + call .main + mova [dstq+strideq*1+16*5], m2 + mova [dstq+strideq*0+16*5], m0 + call .main + mova [maskq+16*3], m2 + mova [dstq+strideq*0+16*6], m0 + call .main + mova [dstq+strideq*1+16*7], m2 + mova [dstq+strideq*0+16*7], m0 + W_MASK_420_END 0, 1, 2, 3 + sub hd, 2 + jg .w128_loop + RET +ALIGN function_align +.main: + mova m0, [tmp1q +16*0] + mova m3, [tmp1q+tmp2q+16*0] + mova m1, [tmp1q +16*1] + mova m4, [tmp1q+tmp2q+16*1] + add tmp1q, 16*2 + psubw m3, m0 + psubw m4, m1 + pabsw m5, m3 + psubusw m2, m8, m5 + psrlw m2, 8 ; 64 - m + psllw m5, m2, 10 + pmulhw m3, m5 + pabsw m5, m4 + paddw m0, m3 + psubusw m3, m8, m5 + psrlw m3, 8 + phaddw m2, m3 + psllw m3, 10 + pmulhw m4, m3 + paddw m1, m4 + pmulhrsw m0, m6 + pmulhrsw m1, m6 + packuswb m0, m1 + ret + +%macro W_MASK_422_BACKUP 1 ; mask_offset +%if ARCH_X86_64 + mova m10, m2 +%else + mova [maskq+16*%1], m2 +%endif +%endmacro + +%macro W_MASK_422_END 1 ; mask_offset +%if ARCH_X86_64 + packuswb m10, m2 + psubb m1, m7, m10 + pavgb m1, m9 +%else + mova m3, [maskq+16*%1] + packuswb m3, m2 + pxor m2, m2 + psubb m1, m7, m3 + pavgb m1, m2 +%endif + mova [maskq+16*%1], m1 +%endmacro + +cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1, tmp2, w, h, mask +%define base t0-w_mask_422_ssse3_table + LEA t0, w_mask_422_ssse3_table + tzcnt wd, wm + mov r6d, r7m ; sign + sub tmp2q, tmp1q + movsxd wq, [t0+wq*4] + mova m6, [base+pw_2048] + movddup m7, [base+wm_422_sign+r6*8] ; 128 - sign + add wq, t0 +%if ARCH_X86_64 + mova m8, [base+pw_6903] + pxor m9, m9 + movifnidn hd, hm +%else + add t0, w_mask_420_ssse3_table-w_mask_422_ssse3_table + %define hd dword hm +%endif + mov maskq, maskmp + call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main + jmp wq +.w4_loop: + call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main + add maskq, 8 + lea dstq, [dstq+strideq*2] +.w4: + packuswb m2, m2 + psubb m1, m7, m2 +%if ARCH_X86_64 + pavgb m1, m9 +%else + pxor m2, m2 + pavgb m1, m2 +%endif + movq [maskq], m1 + movd [dstq+strideq*0], m0 + pshuflw m1, m0, q1032 + movd 
[dstq+strideq*1], m1 + punpckhqdq m0, m0 + lea dstq, [dstq+strideq*2] + movd [dstq+strideq*0], m0 + pshuflw m1, m0, q1032 + movd [dstq+strideq*1], m1 + sub hd, 4 + jg .w4_loop + RET +.w8_loop: + call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main + add maskq, 16 + lea dstq, [dstq+strideq*2] +.w8: + W_MASK_422_BACKUP 0 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main + lea dstq, [dstq+strideq*2] + W_MASK_422_END 0 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + sub hd, 4 + jg .w8_loop + RET +.w16_loop: + call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main + add maskq, 16 + lea dstq, [dstq+strideq*2] +.w16: + W_MASK_422_BACKUP 0 + mova [dstq+strideq*0], m0 + call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main + W_MASK_422_END 0 + mova [dstq+strideq*1], m0 + sub hd, 2 + jg .w16_loop + RET +.w32_loop: + call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main + add maskq, 16 + add dstq, strideq +.w32: + W_MASK_422_BACKUP 0 + mova [dstq+16*0], m0 + call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main + W_MASK_422_END 0 + mova [dstq+16*1], m0 + dec hd + jg .w32_loop + RET +.w64_loop: + call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main + add maskq, 16*2 + add dstq, strideq +.w64: + W_MASK_422_BACKUP 0 + mova [dstq+16*0], m0 + call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main + W_MASK_422_END 0 + mova [dstq+16*1], m0 + call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main + W_MASK_422_BACKUP 1 + mova [dstq+16*2], m0 + call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main + W_MASK_422_END 1 + mova [dstq+16*3], m0 + dec hd + jg .w64_loop + RET +.w128_loop: + call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main + add maskq, 16*4 + add dstq, strideq +.w128: + W_MASK_422_BACKUP 0 + mova [dstq+16*0], m0 + call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main + W_MASK_422_END 0 + mova [dstq+16*1], m0 + call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main + W_MASK_422_BACKUP 1 + mova [dstq+16*2], m0 + call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main + W_MASK_422_END 1 + mova [dstq+16*3], m0 + call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main + W_MASK_422_BACKUP 2 + mova [dstq+16*4], m0 + call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main + W_MASK_422_END 2 + mova [dstq+16*5], m0 + call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main + W_MASK_422_BACKUP 3 + mova [dstq+16*6], m0 + call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main + W_MASK_422_END 3 + mova [dstq+16*7], m0 + dec hd + jg .w128_loop + RET + +cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask +%define base t0-w_mask_444_ssse3_table + LEA t0, w_mask_444_ssse3_table + tzcnt wd, wm + mov maskq, maskmp + sub tmp2q, tmp1q + movsxd wq, [t0+wq*4] + mova m6, [base+pw_6903] + mova m7, [base+pw_2048] + add wq, t0 +%if ARCH_X86_64 + mova m8, [base+pb_64] + movifnidn hd, hm +%else + %define m8 [base+pb_64] + %define hd dword hm +%endif + call .main + jmp wq +.w4_loop: + call .main + lea dstq, [dstq+strideq*2] +.w4: + movd [dstq+strideq*0], m0 + pshuflw m1, m0, q1032 + movd [dstq+strideq*1], m1 + punpckhqdq m0, m0 + lea dstq, [dstq+strideq*2] + movd [dstq+strideq*0], m0 + pshuflw m1, m0, q1032 + movd [dstq+strideq*1], m1 + sub hd, 4 + jg .w4_loop + RET +.w8_loop: + call .main + lea dstq, [dstq+strideq*2] +.w8: + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + sub hd, 2 + jg .w8_loop + RET +.w16_loop: + call .main + lea dstq, 
[dstq+strideq*2] +.w16: + mova [dstq+strideq*0], m0 + call .main + mova [dstq+strideq*1], m0 + sub hd, 2 + jg .w16_loop + RET +.w32_loop: + call .main + add dstq, strideq +.w32: + mova [dstq+16*0], m0 + call .main + mova [dstq+16*1], m0 + dec hd + jg .w32_loop + RET +.w64_loop: + call .main + add dstq, strideq +.w64: + mova [dstq+16*0], m0 + call .main + mova [dstq+16*1], m0 + call .main + mova [dstq+16*2], m0 + call .main + mova [dstq+16*3], m0 + dec hd + jg .w64_loop + RET +.w128_loop: + call .main + add dstq, strideq +.w128: + mova [dstq+16*0], m0 + call .main + mova [dstq+16*1], m0 + call .main + mova [dstq+16*2], m0 + call .main + mova [dstq+16*3], m0 + call .main + mova [dstq+16*4], m0 + call .main + mova [dstq+16*5], m0 + call .main + mova [dstq+16*6], m0 + call .main + mova [dstq+16*7], m0 + dec hd + jg .w128_loop + RET +ALIGN function_align +.main: + mova m0, [tmp1q +16*0] + mova m3, [tmp1q+tmp2q+16*0] + mova m1, [tmp1q +16*1] + mova m4, [tmp1q+tmp2q+16*1] + add tmp1q, 16*2 + psubw m3, m0 + psubw m4, m1 + pabsw m5, m3 + psubusw m2, m6, m5 + psrlw m2, 8 ; 64 - m + psllw m5, m2, 10 + pmulhw m3, m5 + pabsw m5, m4 + paddw m0, m3 + psubusw m3, m6, m5 + psrlw m3, 8 + packuswb m2, m3 + psllw m3, 10 + pmulhw m4, m3 + psubb m3, m8, m2 + paddw m1, m4 + pmulhrsw m0, m7 + pmulhrsw m1, m7 + mova [maskq], m3 + add maskq, 16 + packuswb m0, m1 + ret + +%macro BLEND_64M 4; a, b, mask1, mask2 + punpcklbw m0, %1, %2; {b;a}[7..0] + punpckhbw %1, %2 ; {b;a}[15..8] + pmaddubsw m0, %3 ; {b*m[0] + (64-m[0])*a}[7..0] u16 + pmaddubsw %1, %4 ; {b*m[1] + (64-m[1])*a}[15..8] u16 + pmulhrsw m0, m5 ; {((b*m[0] + (64-m[0])*a) + 1) / 32}[7..0] u16 + pmulhrsw %1, m5 ; {((b*m[1] + (64-m[0])*a) + 1) / 32}[15..8] u16 + packuswb m0, %1 ; {blendpx}[15..0] u8 +%endmacro + +%macro BLEND 2; a, b + psubb m3, m4, m0 ; m3 = (64 - m) + punpcklbw m2, m3, m0 ; {m;(64-m)}[7..0] + punpckhbw m3, m0 ; {m;(64-m)}[15..8] + BLEND_64M %1, %2, m2, m3 +%endmacro + +cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask +%define base r6-blend_ssse3_table + LEA r6, blend_ssse3_table + tzcnt wd, wm + movifnidn hd, hm + movifnidn maskq, maskmp + movsxd wq, dword [r6+wq*4] + mova m4, [base+pb_64] + mova m5, [base+pw_512] + add wq, r6 + lea r6, [dsq*3] + jmp wq +.w4: + movq m0, [maskq]; m + movd m1, [dstq+dsq*0] ; a + movd m6, [dstq+dsq*1] + punpckldq m1, m6 + movq m6, [tmpq] ; b + psubb m3, m4, m0 ; m3 = (64 - m) + punpcklbw m2, m3, m0 ; {m;(64-m)}[7..0] + punpcklbw m1, m6 ; {b;a}[7..0] + pmaddubsw m1, m2 ; {b*m[0] + (64-m[0])*a}[7..0] u16 + pmulhrsw m1, m5 ; {((b*m[0] + (64-m[0])*a) + 1) / 32}[7..0] u16 + packuswb m1, m0 ; {blendpx}[15..0] u8 + movd [dstq+dsq*0], m1 + psrlq m1, 32 + movd [dstq+dsq*1], m1 + add maskq, 8 + add tmpq, 8 + lea dstq, [dstq+dsq*2] ; dst_stride * 2 + sub hd, 2 + jg .w4 + RET +.w8: + mova m0, [maskq]; m + movq m1, [dstq+dsq*0] ; a + movhps m1, [dstq+dsq*1] + mova m6, [tmpq] ; b + BLEND m1, m6 + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + add maskq, 16 + add tmpq, 16 + lea dstq, [dstq+dsq*2] ; dst_stride * 2 + sub hd, 2 + jg .w8 + RET +.w16: + mova m0, [maskq]; m + mova m1, [dstq] ; a + mova m6, [tmpq] ; b + BLEND m1, m6 + mova [dstq], m0 + add maskq, 16 + add tmpq, 16 + add dstq, dsq ; dst_stride + dec hd + jg .w16 + RET +.w32: + %assign i 0 + %rep 2 + mova m0, [maskq+16*i]; m + mova m1, [dstq+16*i] ; a + mova m6, [tmpq+16*i] ; b + BLEND m1, m6 + mova [dstq+i*16], m0 + %assign i i+1 + %endrep + add maskq, 32 + add tmpq, 32 + add dstq, dsq ; dst_stride + dec hd + jg .w32 + RET + +cglobal blend_v_8bpc, 3, 6, 6, 
dst, ds, tmp, w, h, mask +%define base r5-blend_v_ssse3_table + LEA r5, blend_v_ssse3_table + tzcnt wd, wm + movifnidn hd, hm + movsxd wq, dword [r5+wq*4] + mova m5, [base+pw_512] + add wq, r5 + add maskq, obmc_masks-blend_v_ssse3_table + jmp wq +.w2: + movd m3, [maskq+4] + punpckldq m3, m3 + ; 2 mask blend is provided for 4 pixels / 2 lines +.w2_loop: + movd m1, [dstq+dsq*0] ; a {..;a;a} + pinsrw m1, [dstq+dsq*1], 1 + movd m2, [tmpq] ; b + punpcklbw m0, m1, m2; {b;a}[7..0] + pmaddubsw m0, m3 ; {b*m + (64-m)*a}[7..0] u16 + pmulhrsw m0, m5 ; {((b*m + (64-m)*a) + 1) / 32}[7..0] u16 + packuswb m0, m1 ; {blendpx}[8..0] u8 + movd r3d, m0 + mov [dstq+dsq*0], r3w + shr r3d, 16 + mov [dstq+dsq*1], r3w + add tmpq, 2*2 + lea dstq, [dstq + dsq * 2] + sub hd, 2 + jg .w2_loop + RET +.w4: + movddup m3, [maskq+8] + ; 4 mask blend is provided for 8 pixels / 2 lines +.w4_loop: + movd m1, [dstq+dsq*0] ; a + movd m2, [dstq+dsq*1] ; + punpckldq m1, m2 + movq m2, [tmpq] ; b + punpcklbw m1, m2 ; {b;a}[7..0] + pmaddubsw m1, m3 ; {b*m + (64-m)*a}[7..0] u16 + pmulhrsw m1, m5 ; {((b*m + (64-m)*a) + 1) / 32}[7..0] u16 + packuswb m1, m1 ; {blendpx}[8..0] u8 + movd [dstq], m1 + psrlq m1, 32 + movd [dstq+dsq*1], m1 + add tmpq, 2*4 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w4_loop + RET +.w8: + mova m3, [maskq+16] + ; 8 mask blend is provided for 16 pixels +.w8_loop: + movq m1, [dstq+dsq*0] ; a + movhps m1, [dstq+dsq*1] + mova m2, [tmpq]; b + BLEND_64M m1, m2, m3, m3 + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + add tmpq, 16 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .w8_loop + RET +.w16: + ; 16 mask blend is provided for 32 pixels + mova m3, [maskq+32] ; obmc_masks_16[0] (64-m[0]) + mova m4, [maskq+48] ; obmc_masks_16[1] (64-m[1]) +.w16_loop: + mova m1, [dstq] ; a + mova m2, [tmpq] ; b + BLEND_64M m1, m2, m3, m4 + mova [dstq], m0 + add tmpq, 16 + add dstq, dsq + dec hd + jg .w16_loop + RET +.w32: +%if WIN64 + mova [rsp+8], xmm6 +%endif + mova m3, [maskq+64] ; obmc_masks_32[0] (64-m[0]) + mova m4, [maskq+80] ; obmc_masks_32[1] (64-m[1]) + mova m6, [maskq+96] ; obmc_masks_32[2] (64-m[2]) + ; 16 mask blend is provided for 64 pixels +.w32_loop: + mova m1, [dstq+16*0] ; a + mova m2, [tmpq+16*0] ; b + BLEND_64M m1, m2, m3, m4 + movq m1, [dstq+16*1] ; a + punpcklbw m1, [tmpq+16*1] ; b + pmaddubsw m1, m6 + pmulhrsw m1, m5 + packuswb m1, m1 + mova [dstq+16*0], m0 + movq [dstq+16*1], m1 + add tmpq, 32 + add dstq, dsq + dec hd + jg .w32_loop +%if WIN64 + mova xmm6, [rsp+8] +%endif + RET + +cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask +%define base t0-blend_h_ssse3_table +%if ARCH_X86_32 + ; We need to keep the PIC pointer for w4, reload wd from stack instead + DECLARE_REG_TMP 6 +%else + DECLARE_REG_TMP 5 + mov r6d, wd +%endif + LEA t0, blend_h_ssse3_table + tzcnt wd, wm + mov hd, hm + movsxd wq, dword [t0+wq*4] + mova m5, [base+pw_512] + add wq, t0 + lea maskq, [base+obmc_masks+hq*2] + lea hd, [hq*3] + shr hd, 2 ; h * 3/4 + lea maskq, [maskq+hq*2] + neg hq + jmp wq +.w2: + movd m0, [dstq+dsq*0] + pinsrw m0, [dstq+dsq*1], 1 + movd m2, [maskq+hq*2] + movd m1, [tmpq] + punpcklwd m2, m2 + punpcklbw m0, m1 + pmaddubsw m0, m2 + pmulhrsw m0, m5 + packuswb m0, m0 + movd r3d, m0 + mov [dstq+dsq*0], r3w + shr r3d, 16 + mov [dstq+dsq*1], r3w + lea dstq, [dstq+dsq*2] + add tmpq, 2*2 + add hq, 2 + jl .w2 + RET +.w4: +%if ARCH_X86_32 + mova m3, [base+blend_shuf] +%else + mova m3, [blend_shuf] +%endif +.w4_loop: + movd m0, [dstq+dsq*0] + movd m2, [dstq+dsq*1] + punpckldq m0, m2 ; a + movq m1, [tmpq] ; b + movq m2, [maskq+hq*2] 
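+; Assumed layout (illustrative): obmc_masks stores one interleaved
+; {64-m, m} byte pair per row, hence the hq*2 scaling of the mask index;
+; blend_shuf is assumed to broadcast row 0's pair across the low qword
+; and row 1's pair across the high qword, so the single pmaddubsw below
+; computes a*(64-m) + b*m for both 4-pixel rows at once.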
; m + pshufb m2, m3 + punpcklbw m0, m1 + pmaddubsw m0, m2 + pmulhrsw m0, m5 + packuswb m0, m0 + movd [dstq+dsq*0], m0 + psrlq m0, 32 + movd [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + add tmpq, 4*2 + add hq, 2 + jl .w4_loop + RET +.w8: + movd m4, [maskq+hq*2] + punpcklwd m4, m4 + pshufd m3, m4, q0000 + pshufd m4, m4, q1111 + movq m1, [dstq+dsq*0] ; a + movhps m1, [dstq+dsq*1] + mova m2, [tmpq] + BLEND_64M m1, m2, m3, m4 + movq [dstq+dsq*0], m0 + movhps [dstq+dsq*1], m0 + lea dstq, [dstq+dsq*2] + add tmpq, 8*2 + add hq, 2 + jl .w8 + RET +; w16/w32/w64/w128 +.w16: +%if ARCH_X86_32 + mov r6d, wm +%endif + sub dsq, r6 +.w16_loop0: + movd m3, [maskq+hq*2] + pshuflw m3, m3, q0000 + punpcklqdq m3, m3 + mov wd, r6d +.w16_loop: + mova m1, [dstq] ; a + mova m2, [tmpq] ; b + BLEND_64M m1, m2, m3, m3 + mova [dstq], m0 + add dstq, 16 + add tmpq, 16 + sub wd, 16 + jg .w16_loop + add dstq, dsq + inc hq + jl .w16_loop0 + RET + +; emu_edge args: +; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih, +; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride, +; const pixel *ref, const ptrdiff_t ref_stride +; +; bw, bh total filled size +; iw, ih, copied block -> fill bottom, right +; x, y, offset in bw/bh -> fill top, left +cglobal emu_edge_8bpc, 10, 13, 2, bw, bh, iw, ih, x, \ + y, dst, dstride, src, sstride, \ + bottomext, rightext, blk + ; we assume that the buffer (stride) is larger than width, so we can + ; safely overwrite by a few bytes + pxor m1, m1 + +%if ARCH_X86_64 + %define reg_zero r12q + %define reg_tmp r10 + %define reg_src srcq + %define reg_bottomext bottomextq + %define reg_rightext rightextq + %define reg_blkm r9m +%else + %define reg_zero r6 + %define reg_tmp r0 + %define reg_src r1 + %define reg_bottomext r0 + %define reg_rightext r1 + %define reg_blkm r2m +%endif + ; + ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + xor reg_zero, reg_zero + lea reg_tmp, [ihq-1] + cmp yq, ihq + cmovs reg_tmp, yq + test yq, yq + cmovs reg_tmp, reg_zero +%if ARCH_X86_64 + imul reg_tmp, sstrideq + add srcq, reg_tmp +%else + imul reg_tmp, sstridem + mov reg_src, srcm + add reg_src, reg_tmp +%endif + ; + ; ref += iclip(x, 0, iw - 1) + lea reg_tmp, [iwq-1] + cmp xq, iwq + cmovs reg_tmp, xq + test xq, xq + cmovs reg_tmp, reg_zero + add reg_src, reg_tmp +%if ARCH_X86_32 + mov srcm, reg_src +%endif + ; + ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) +%if ARCH_X86_32 + mov r1, r1m ; restore bh +%endif + lea reg_bottomext, [yq+bhq] + sub reg_bottomext, ihq + lea r3, [bhq-1] + cmovs reg_bottomext, reg_zero + ; + + DEFINE_ARGS bw, bh, iw, ih, x, \ + topext, dst, dstride, src, sstride, \ + bottomext, rightext, blk + + ; top_ext = iclip(-y, 0, bh - 1) + neg topextq + cmovs topextq, reg_zero + cmp reg_bottomext, bhq + cmovns reg_bottomext, r3 + cmp topextq, bhq + cmovg topextq, r3 + %if ARCH_X86_32 + mov r4m, reg_bottomext + ; + ; right_ext = iclip(x + bw - iw, 0, bw - 1) + mov r0, r0m ; restore bw + %endif + lea reg_rightext, [xq+bwq] + sub reg_rightext, iwq + lea r2, [bwq-1] + cmovs reg_rightext, reg_zero + + DEFINE_ARGS bw, bh, iw, ih, leftext, \ + topext, dst, dstride, src, sstride, \ + bottomext, rightext, blk + + ; left_ext = iclip(-x, 0, bw - 1) + neg leftextq + cmovs leftextq, reg_zero + cmp reg_rightext, bwq + cmovns reg_rightext, r2 + %if ARCH_X86_32 + mov r3m, r1 + %endif + cmp leftextq, bwq + cmovns leftextq, r2 + +%undef reg_zero +%undef reg_tmp +%undef reg_src +%undef reg_bottomext +%undef reg_rightext + + DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \ 
+ topext, dst, dstride, src, sstride, \ + bottomext, rightext, blk + + ; center_h = bh - top_ext - bottom_ext +%if ARCH_X86_64 + lea r3, [bottomextq+topextq] + sub centerhq, r3 +%else + mov r1, centerhm ; restore r1 + sub centerhq, topextq + sub centerhq, r4m + mov r1m, centerhq +%endif + ; + ; blk += top_ext * PXSTRIDE(dst_stride) + mov r2, topextq +%if ARCH_X86_64 + imul r2, dstrideq +%else + mov r6, r6m ; restore dstq + imul r2, dstridem +%endif + add dstq, r2 + mov reg_blkm, dstq ; save pointer for ext + ; + ; center_w = bw - left_ext - right_ext + mov centerwq, bwq +%if ARCH_X86_64 + lea r3, [rightextq+leftextq] + sub centerwq, r3 +%else + sub centerwq, r3m + sub centerwq, leftextq +%endif + +; vloop Macro +%macro v_loop 3 ; need_left_ext, need_right_ext, suffix + %if ARCH_X86_64 + %define reg_tmp r12 + %else + %define reg_tmp r0 + %endif +.v_loop_%3: + %if ARCH_X86_32 + mov r0, r0m + mov r1, r1m + %endif +%if %1 + ; left extension + %if ARCH_X86_64 + movd m0, [srcq] + %else + mov r3, srcm + movd m0, [r3] + %endif + pshufb m0, m1 + xor r3, r3 +.left_loop_%3: + mova [dstq+r3], m0 + add r3, mmsize + cmp r3, leftextq + jl .left_loop_%3 + ; body + lea reg_tmp, [dstq+leftextq] +%endif + xor r3, r3 +.body_loop_%3: + %if ARCH_X86_64 + movu m0, [srcq+r3] + %else + mov r1, srcm + movu m0, [r1+r3] + %endif +%if %1 + movu [reg_tmp+r3], m0 +%else + movu [dstq+r3], m0 +%endif + add r3, mmsize + cmp r3, centerwq + jl .body_loop_%3 +%if %2 + ; right extension +%if %1 + add reg_tmp, centerwq +%else + lea reg_tmp, [dstq+centerwq] +%endif + %if ARCH_X86_64 + movd m0, [srcq+centerwq-1] + %else + mov r3, srcm + movd m0, [r3+centerwq-1] + %endif + pshufb m0, m1 + xor r3, r3 +.right_loop_%3: + movu [reg_tmp+r3], m0 + add r3, mmsize + %if ARCH_X86_64 + cmp r3, rightextq + %else + cmp r3, r3m + %endif + jl .right_loop_%3 +%endif + %if ARCH_X86_64 + add dstq, dstrideq + add srcq, sstrideq + dec centerhq + jg .v_loop_%3 + %else + add dstq, dstridem + mov r0, sstridem + add srcm, r0 + sub dword centerhm, 1 + jg .v_loop_%3 + mov r0, r0m ; restore r0 + %endif +%endmacro ; vloop MACRO + + test leftextq, leftextq + jnz .need_left_ext + %if ARCH_X86_64 + test rightextq, rightextq + jnz .need_right_ext + %else + cmp leftextq, r3m ; leftextq == 0 + jne .need_right_ext + %endif + v_loop 0, 0, 0 + jmp .body_done + + ;left right extensions +.need_left_ext: + %if ARCH_X86_64 + test rightextq, rightextq + %else + mov r3, r3m + test r3, r3 + %endif + jnz .need_left_right_ext + v_loop 1, 0, 1 + jmp .body_done + +.need_left_right_ext: + v_loop 1, 1, 2 + jmp .body_done + +.need_right_ext: + v_loop 0, 1, 3 + +.body_done: +; r0 ; bw +; r1 ;; x loop +; r4 ;; y loop +; r5 ; topextq +; r6 ;dstq +; r7 ;dstrideq +; r8 ; srcq +%if ARCH_X86_64 + %define reg_dstride dstrideq +%else + %define reg_dstride r2 +%endif + ; + ; bottom edge extension + %if ARCH_X86_64 + test bottomextq, bottomextq + jz .top + %else + xor r1, r1 + cmp r1, r4m + je .top + %endif + ; + %if ARCH_X86_64 + mov srcq, dstq + sub srcq, dstrideq + xor r1, r1 + %else + mov r3, dstq + mov reg_dstride, dstridem + sub r3, reg_dstride + mov srcm, r3 + %endif + ; +.bottom_x_loop: + %if ARCH_X86_64 + mova m0, [srcq+r1] + lea r3, [dstq+r1] + mov r4, bottomextq + %else + mov r3, srcm + mova m0, [r3+r1] + lea r3, [dstq+r1] + mov r4, r4m + %endif + ; +.bottom_y_loop: + mova [r3], m0 + add r3, reg_dstride + dec r4 + jg .bottom_y_loop + add r1, mmsize + cmp r1, bwq + jl .bottom_x_loop + +.top: + ; top edge extension + test topextq, topextq + jz .end +%if ARCH_X86_64 + mov srcq, 
reg_blkm +%else + mov r3, reg_blkm + mov reg_dstride, dstridem +%endif + mov dstq, dstm + xor r1, r1 + ; +.top_x_loop: +%if ARCH_X86_64 + mova m0, [srcq+r1] +%else + mov r3, reg_blkm + mova m0, [r3+r1] +%endif + lea r3, [dstq+r1] + mov r4, topextq + ; +.top_y_loop: + mova [r3], m0 + add r3, reg_dstride + dec r4 + jg .top_y_loop + add r1, mmsize + cmp r1, bwq + jl .top_x_loop + +.end: + RET + +%undef reg_dstride +%undef reg_blkm +%undef reg_tmp + +cextern resize_filter + +%macro SCRATCH 3 +%if ARCH_X86_32 + mova [rsp+%3*mmsize], m%1 +%define m%2 [rsp+%3*mmsize] +%else + SWAP %1, %2 +%endif +%endmacro + +%if ARCH_X86_64 +cglobal resize_8bpc, 0, 12, 14, dst, dst_stride, src, src_stride, \ + dst_w, h, src_w, dx, mx0 +%elif STACK_ALIGNMENT >= 16 +cglobal resize_8bpc, 0, 7, 8, 3 * 16, dst, dst_stride, src, src_stride, \ + dst_w, h, src_w, dx, mx0 +%else +cglobal resize_8bpc, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \ + dst_w, h, src_w, dx, mx0 +%endif + movifnidn dstq, dstmp + movifnidn srcq, srcmp +%if STACK_ALIGNMENT >= 16 + movifnidn dst_wd, dst_wm +%endif +%if ARCH_X86_64 + movifnidn hd, hm +%endif + sub dword mx0m, 4<<14 + sub dword src_wm, 8 + movd m7, dxm + movd m6, mx0m + movd m5, src_wm + pshufd m7, m7, q0000 + pshufd m6, m6, q0000 + pshufd m5, m5, q0000 + +%if ARCH_X86_64 + DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x + LEA r7, $$ +%define base r7-$$ +%else + DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x +%define hd dword r5m +%if STACK_ALIGNMENT >= 16 + LEA r6, $$ +%define base r6-$$ +%else + LEA r4, $$ +%define base r4-$$ +%endif +%endif + +%if ARCH_X86_64 + mova m10, [base+pw_m256] + mova m9, [base+pd_63] + mova m8, [base+pb_8x0_8x8] +%else +%define m10 [base+pw_m256] +%define m9 [base+pd_63] +%define m8 [base+pb_8x0_8x8] +%endif + pmaddwd m4, m7, [base+rescale_mul] ; dx*[0,1,2,3] + pslld m7, 2 ; dx*4 + pslld m5, 14 + paddd m6, m4 ; mx+[0..3]*dx + SCRATCH 7, 13, 0 + SCRATCH 6, 12, 1 + SCRATCH 5, 11, 2 + + ; m10 = pmulhrsw constant for x=(x+64)>>7 + ; m12 = mx+[0..3]*dx, m13 = dx*4, m11 = src_w, m9 = 0x3f, m8=0,8 + +.loop_y: + xor xd, xd + mova m0, m12 ; per-line working version of mx + +.loop_x: + pxor m1, m1 + pcmpgtd m1, m0 + pandn m1, m0 + psrad m2, m0, 8 ; filter offset (unmasked) + pcmpgtd m3, m11, m1 + pand m1, m3 + pandn m3, m11 + por m1, m3 + psubd m3, m0, m1 ; pshufb offset + psrad m1, 14 ; clipped src_x offset + psrad m3, 14 ; pshufb edge_emu offset + pand m2, m9 ; filter offset (masked) + + ; load source pixels +%if ARCH_X86_64 + movd r8d, m1 + pshuflw m1, m1, q3232 + movd r9d, m1 + punpckhqdq m1, m1 + movd r10d, m1 + psrlq m1, 32 + movd r11d, m1 + movq m4, [srcq+r8] + movq m5, [srcq+r10] + movhps m4, [srcq+r9] + movhps m5, [srcq+r11] +%else + movd r3d, m1 + pshufd m1, m1, q3312 + movd r1d, m1 + pshuflw m1, m1, q3232 + movq m4, [srcq+r3] + movq m5, [srcq+r1] + movd r3d, m1 + punpckhqdq m1, m1 + movd r1d, m1 + movhps m4, [srcq+r3] + movhps m5, [srcq+r1] +%endif + + ; if no emulation is required, we don't need to shuffle or emulate edges + ; this also saves 2 quasi-vpgatherdqs + pxor m6, m6 + pcmpeqb m6, m3 +%if ARCH_X86_64 + pmovmskb r8d, m6 + cmp r8d, 0xffff +%else + pmovmskb r3d, m6 + cmp r3d, 0xffff +%endif + je .filter + +%if ARCH_X86_64 + movd r8d, m3 + pshuflw m3, m3, q3232 + movd r9d, m3 + punpckhqdq m3, m3 + movd r10d, m3 + psrlq m3, 32 + movd r11d, m3 + movsxd r8, r8d + movsxd r9, r9d + movsxd r10, r10d + movsxd r11, r11d + movq m6, [base+resize_shuf+4+r8] + movq m7, [base+resize_shuf+4+r10] + movhps m6, [base+resize_shuf+4+r9] + 
movhps m7, [base+resize_shuf+4+r11] +%else + movd r3d, m3 + pshufd m3, m3, q3312 + movd r1d, m3 + pshuflw m3, m3, q3232 + movq m6, [base+resize_shuf+4+r3] + movq m7, [base+resize_shuf+4+r1] + movd r3d, m3 + punpckhqdq m3, m3 + movd r1d, m3 + movhps m6, [base+resize_shuf+4+r3] + movhps m7, [base+resize_shuf+4+r1] +%endif + + paddb m6, m8 + paddb m7, m8 + pshufb m4, m6 + pshufb m5, m7 + +.filter: +%if ARCH_X86_64 + movd r8d, m2 + pshuflw m2, m2, q3232 + movd r9d, m2 + punpckhqdq m2, m2 + movd r10d, m2 + psrlq m2, 32 + movd r11d, m2 + movq m6, [base+resize_filter+r8*8] + movq m7, [base+resize_filter+r10*8] + movhps m6, [base+resize_filter+r9*8] + movhps m7, [base+resize_filter+r11*8] +%else + movd r3d, m2 + pshufd m2, m2, q3312 + movd r1d, m2 + pshuflw m2, m2, q3232 + movq m6, [base+resize_filter+r3*8] + movq m7, [base+resize_filter+r1*8] + movd r3d, m2 + punpckhqdq m2, m2 + movd r1d, m2 + movhps m6, [base+resize_filter+r3*8] + movhps m7, [base+resize_filter+r1*8] +%endif + + pmaddubsw m4, m6 + pmaddubsw m5, m7 + phaddw m4, m5 + phaddsw m4, m4 + pmulhrsw m4, m10 ; x=(x+64)>>7 + packuswb m4, m4 + movd [dstq+xq], m4 + + paddd m0, m13 + add xd, 4 +%if STACK_ALIGNMENT >= 16 + cmp xd, dst_wd +%else + cmp xd, dst_wm +%endif + jl .loop_x + + add dstq, dst_stridemp + add srcq, src_stridemp + dec hd + jg .loop_y + RET + +INIT_XMM ssse3 +PREP_BILIN +PREP_8TAP +WARP_AFFINE_8X8 +WARP_AFFINE_8X8T + +INIT_XMM sse4 +WARP_AFFINE_8X8 +WARP_AFFINE_8X8T + +INIT_XMM sse2 +PREP_BILIN +PREP_8TAP +WARP_AFFINE_8X8 +WARP_AFFINE_8X8T diff --git a/third_party/dav1d/src/x86/msac.asm b/third_party/dav1d/src/x86/msac.asm new file mode 100644 index 0000000000..9f05c921a6 --- /dev/null +++ b/third_party/dav1d/src/x86/msac.asm @@ -0,0 +1,667 @@ +; Copyright © 2019, VideoLAN and dav1d authors +; Copyright © 2019, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
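+
+; Informal scalar sketch of what the SIMD symbol decoders in this file
+; compute, pieced together from the inline comments below; names are
+; illustrative, and the authoritative reference is the C code in src/msac.c:
+;
+;   c = dif >> (gprsize*8 - 16);            // top 16 bits of the window
+;   for (val = 0; ; val++) {                // evaluated in parallel below
+;       v = ((rng >> 8) * (cdf[val] >> 6) >> 1) + 4 * (n_symbols - val);
+;       if (c >= v) break;                  // psubusw + pcmpeqw mask
+;   }                                       // 4*(n-val) is the min_prob table
+;   if (update_cdf) {                       // adapt toward the decoded symbol
+;       rate = 4 + (count >> 4) + (n_symbols > 2);
+;       for (i = 0; i < val; i++)  cdf[i] += (32768 - cdf[i]) >> rate;
+;       for (; i < n_symbols; i++) cdf[i] += ((-1 - cdf[i]) >> rate) + 1;
+;       cdf[n_symbols] = count + (count < 32);   // saturating visit counter
+;   }
+;   // renorm: d = 15 - bsr(rng); rng <<= d; dif <<= d; cnt -= d; when cnt
+;   // underflows, dif is refilled from buf (bytewise near eob to avoid
+;   // overreading the input buffer).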
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA 64 ; avoids cacheline splits + +min_prob: dw 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0 +pw_0xff00: times 8 dw 0xff00 +pw_32: times 8 dw 32 + +%if ARCH_X86_64 +%define resp resq +%define movp movq +%define c_shuf q3333 +%macro DECODE_SYMBOL_ADAPT_INIT 0-1 +%endmacro +%else +%define resp resd +%define movp movd +%define c_shuf q1111 +%macro DECODE_SYMBOL_ADAPT_INIT 0-1 0 ; hi_tok + mov t0, r0m + mov t1, r1m +%if %1 == 0 + mov t2, r2m +%endif +%if STACK_ALIGNMENT >= 16 + sub esp, 40-%1*4 +%else + mov eax, esp + and esp, ~15 + sub esp, 40-%1*4 + mov [esp], eax +%endif +%endmacro +%endif + +struc msac + .buf: resp 1 + .end: resp 1 + .dif: resp 1 + .rng: resd 1 + .cnt: resd 1 + .update_cdf: resd 1 +endstruc + +%define m(x, y) mangle(private_prefix %+ _ %+ x %+ y) + +SECTION .text + +%if WIN64 +DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 3, 8 +%define buf rsp+stack_offset+8 ; shadow space +%elif UNIX64 +DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 0, 8 +%define buf rsp-40 ; red zone +%else +DECLARE_REG_TMP 2, 3, 4, 1, 5, 6, 5, 2, 3 +%define buf esp+8 +%endif + +INIT_XMM sse2 +cglobal msac_decode_symbol_adapt4, 0, 6, 6 + DECODE_SYMBOL_ADAPT_INIT + LEA rax, pw_0xff00 + movd m2, [t0+msac.rng] + movq m1, [t1] + movp m3, [t0+msac.dif] + mov t3d, [t0+msac.update_cdf] + mov t4d, t2d + not t2 ; -(n_symbols + 1) + pshuflw m2, m2, q0000 + movd [buf+12], m2 + pand m2, [rax] + mova m0, m1 + psrlw m1, 6 + psllw m1, 7 + pmulhuw m1, m2 + movq m2, [rax+t2*2] + pshuflw m3, m3, c_shuf + paddw m1, m2 + mova [buf+16], m1 + psubusw m1, m3 + pxor m2, m2 + pcmpeqw m1, m2 ; c >= v + pmovmskb eax, m1 + test t3d, t3d + jz .renorm ; !allow_update_cdf + +; update_cdf: + movzx t3d, word [t1+t4*2] ; count + pcmpeqw m2, m2 + mov t2d, t3d + shr t3d, 4 + cmp t4d, 3 + sbb t3d, -5 ; (count >> 4) + (n_symbols > 2) + 4 + cmp t2d, 32 + adc t2d, 0 ; count + (count < 32) + movd m3, t3d + pavgw m2, m1 ; i >= val ? 
-1 : 32768 + psubw m2, m0 ; for (i = 0; i < val; i++) + psubw m0, m1 ; cdf[i] += (32768 - cdf[i]) >> rate; + psraw m2, m3 ; for (; i < n_symbols; i++) + paddw m0, m2 ; cdf[i] += (( -1 - cdf[i]) >> rate) + 1; + movq [t1], m0 + mov [t1+t4*2], t2w + +.renorm: + tzcnt eax, eax + mov t4, [t0+msac.dif] + movzx t1d, word [buf+rax+16] ; v + movzx t2d, word [buf+rax+14] ; u + shr eax, 1 +.renorm2: +%if ARCH_X86_64 == 0 +%if STACK_ALIGNMENT >= 16 + add esp, 40 +%else + mov esp, [esp] +%endif +%endif + not t4 + sub t2d, t1d ; rng + shl t1, gprsize*8-16 + add t4, t1 ; ~dif +.renorm3: + mov t1d, [t0+msac.cnt] + movifnidn t7, t0 +.renorm4: + bsr ecx, t2d + xor ecx, 15 ; d +.renorm5: + shl t2d, cl + shl t4, cl + mov [t7+msac.rng], t2d + not t4 + sub t1d, ecx + jae .end ; no refill required + +; refill: + mov t2, [t7+msac.buf] + mov rcx, [t7+msac.end] +%if ARCH_X86_64 == 0 + push t5 +%endif + lea t5, [t2+gprsize] + cmp t5, rcx + ja .refill_eob + mov t2, [t2] + lea ecx, [t1+23] + add t1d, 16 + shr ecx, 3 ; shift_bytes + bswap t2 + sub t5, rcx + shl ecx, 3 ; shift_bits + shr t2, cl + sub ecx, t1d ; shift_bits - 16 - cnt + mov t1d, gprsize*8-16 + shl t2, cl + mov [t7+msac.buf], t5 + sub t1d, ecx ; cnt + gprsize*8 - shift_bits + xor t4, t2 +%if ARCH_X86_64 == 0 + pop t5 +%endif +.end: + mov [t7+msac.cnt], t1d + mov [t7+msac.dif], t4 + RET +.refill_eob: ; avoid overreading the input buffer + mov t5, rcx + mov ecx, gprsize*8-24 + sub ecx, t1d ; c +.refill_eob_loop: + cmp t2, t5 + jae .refill_eob_end ; eob reached + movzx t1d, byte [t2] + inc t2 + shl t1, cl + xor t4, t1 + sub ecx, 8 + jge .refill_eob_loop +.refill_eob_end: + mov t1d, gprsize*8-24 +%if ARCH_X86_64 == 0 + pop t5 +%endif + sub t1d, ecx + mov [t7+msac.buf], t2 + mov [t7+msac.dif], t4 + mov [t7+msac.cnt], t1d + RET + +cglobal msac_decode_symbol_adapt8, 0, 6, 6 + DECODE_SYMBOL_ADAPT_INIT + LEA rax, pw_0xff00 + movd m2, [t0+msac.rng] + mova m1, [t1] + movp m3, [t0+msac.dif] + mov t3d, [t0+msac.update_cdf] + mov t4d, t2d + not t2 + pshuflw m2, m2, q0000 + movd [buf+12], m2 + punpcklqdq m2, m2 + mova m0, m1 + psrlw m1, 6 + pand m2, [rax] + psllw m1, 7 + pmulhuw m1, m2 + movu m2, [rax+t2*2] + pshuflw m3, m3, c_shuf + paddw m1, m2 + punpcklqdq m3, m3 + mova [buf+16], m1 + psubusw m1, m3 + pxor m2, m2 + pcmpeqw m1, m2 + pmovmskb eax, m1 + test t3d, t3d + jz m(msac_decode_symbol_adapt4, SUFFIX).renorm + movzx t3d, word [t1+t4*2] + pcmpeqw m2, m2 + mov t2d, t3d + shr t3d, 4 + cmp t4d, 3 ; may be called with n_symbols <= 2 + sbb t3d, -5 + cmp t2d, 32 + adc t2d, 0 + movd m3, t3d + pavgw m2, m1 + psubw m2, m0 + psubw m0, m1 + psraw m2, m3 + paddw m0, m2 + mova [t1], m0 + mov [t1+t4*2], t2w + jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm + +cglobal msac_decode_symbol_adapt16, 0, 6, 6 + DECODE_SYMBOL_ADAPT_INIT + LEA rax, pw_0xff00 + movd m4, [t0+msac.rng] + mova m2, [t1] + mova m3, [t1+16] + movp m5, [t0+msac.dif] + mov t3d, [t0+msac.update_cdf] + mov t4d, t2d + not t2 +%if WIN64 + sub rsp, 48 ; need 36 bytes, shadow space is only 32 +%endif + pshuflw m4, m4, q0000 + movd [buf-4], m4 + punpcklqdq m4, m4 + mova m0, m2 + psrlw m2, 6 + mova m1, m3 + psrlw m3, 6 + pand m4, [rax] + psllw m2, 7 + psllw m3, 7 + pmulhuw m2, m4 + pmulhuw m3, m4 + movu m4, [rax+t2*2] + pshuflw m5, m5, c_shuf + paddw m2, m4 + psubw m4, [rax-pw_0xff00+pw_32] + punpcklqdq m5, m5 + paddw m3, m4 + mova [buf], m2 + psubusw m2, m5 + mova [buf+16], m3 + psubusw m3, m5 + pxor m4, m4 + pcmpeqw m2, m4 + pcmpeqw m3, m4 + packsswb m5, m2, m3 + pmovmskb eax, m5 + test t3d, t3d + jz .renorm + movzx 
t3d, word [t1+t4*2] + pcmpeqw m4, m4 + mova m5, m4 + lea t2d, [t3+80] ; only support n_symbols > 2 + shr t2d, 4 + cmp t3d, 32 + adc t3d, 0 + pavgw m4, m2 + pavgw m5, m3 + psubw m4, m0 + psubw m0, m2 + movd m2, t2d + psubw m5, m1 + psubw m1, m3 + psraw m4, m2 + psraw m5, m2 + paddw m0, m4 + paddw m1, m5 + mova [t1], m0 + mova [t1+16], m1 + mov [t1+t4*2], t3w +.renorm: + tzcnt eax, eax + mov t4, [t0+msac.dif] + movzx t1d, word [buf+rax*2] + movzx t2d, word [buf+rax*2-2] +%if WIN64 + add rsp, 48 +%endif + jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm2 + +cglobal msac_decode_bool_adapt, 0, 6, 0 + movifnidn t1, r1mp + movifnidn t0, r0mp + movzx eax, word [t1] + movzx t3d, byte [t0+msac.rng+1] + mov t4, [t0+msac.dif] + mov t2d, [t0+msac.rng] +%if ARCH_X86_64 + mov t5d, eax +%endif + and eax, ~63 + imul eax, t3d +%if UNIX64 + mov t6, t4 +%endif + shr eax, 7 + add eax, 4 ; v + mov t3d, eax + shl rax, gprsize*8-16 ; vw + sub t2d, t3d ; r - v + sub t4, rax ; dif - vw + setb al + cmovb t2d, t3d + mov t3d, [t0+msac.update_cdf] +%if UNIX64 + cmovb t4, t6 +%else + cmovb t4, [t0+msac.dif] +%endif +%if ARCH_X86_64 == 0 + movzx eax, al +%endif + not t4 + test t3d, t3d + jz m(msac_decode_symbol_adapt4, SUFFIX).renorm3 +%if UNIX64 == 0 + push t6 +%endif + movzx t6d, word [t1+2] +%if ARCH_X86_64 == 0 + push t5 + movzx t5d, word [t1] +%endif + movifnidn t7, t0 + lea ecx, [t6+64] + cmp t6d, 32 + adc t6d, 0 + mov [t1+2], t6w + imul t6d, eax, -32769 + shr ecx, 4 ; rate + add t6d, t5d ; if (bit) + sub t5d, eax ; cdf[0] -= ((cdf[0] - 32769) >> rate) + 1; + sar t6d, cl ; else + sub t5d, t6d ; cdf[0] -= cdf[0] >> rate; + mov [t1], t5w +%if WIN64 + mov t1d, [t7+msac.cnt] + pop t6 + jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm4 +%else +%if ARCH_X86_64 == 0 + pop t5 + pop t6 +%endif + jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3 +%endif + +cglobal msac_decode_bool_equi, 0, 6, 0 + movifnidn t0, r0mp + mov t1d, [t0+msac.rng] + mov t4, [t0+msac.dif] + mov t2d, t1d + mov t1b, 8 + mov t3, t4 + mov eax, t1d + shr t1d, 1 ; v + shl rax, gprsize*8-17 ; vw + sub t2d, t1d ; r - v + sub t4, rax ; dif - vw + cmovb t2d, t1d + mov t1d, [t0+msac.cnt] + cmovb t4, t3 + movifnidn t7, t0 + mov ecx, 0xbfff + setb al ; the upper 32 bits contains garbage but that's OK + sub ecx, t2d + not t4 + ; In this case of this function, (d =) 16 - clz(v) = 2 - (v >> 14) + ; i.e. 
(0 <= d <= 2) and v < (3 << 14) + shr ecx, 14 ; d +%if ARCH_X86_64 == 0 + movzx eax, al +%endif + jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm5 + +cglobal msac_decode_bool, 0, 6, 0 + movifnidn t0, r0mp + movifnidn t1d, r1m + movzx eax, byte [t0+msac.rng+1] ; r >> 8 + mov t4, [t0+msac.dif] + mov t2d, [t0+msac.rng] + and t1d, ~63 + imul eax, t1d + mov t3, t4 + shr eax, 7 + add eax, 4 ; v + mov t1d, eax + shl rax, gprsize*8-16 ; vw + sub t2d, t1d ; r - v + sub t4, rax ; dif - vw + cmovb t2d, t1d + cmovb t4, t3 + setb al + not t4 +%if ARCH_X86_64 == 0 + movzx eax, al +%endif + jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3 + +%macro HI_TOK 1 ; update_cdf +%if ARCH_X86_64 == 0 + mov eax, -24 +%endif +%%loop: +%if %1 + movzx t2d, word [t1+3*2] +%endif + mova m1, m0 + pshuflw m2, m2, q0000 + psrlw m1, 6 + movd [buf+12], m2 + pand m2, m4 + psllw m1, 7 + pmulhuw m1, m2 +%if ARCH_X86_64 == 0 + add eax, 5 + mov [buf+8], eax +%endif + pshuflw m3, m3, c_shuf + paddw m1, m5 + movq [buf+16], m1 + psubusw m1, m3 + pxor m2, m2 + pcmpeqw m1, m2 + pmovmskb eax, m1 +%if %1 + lea ecx, [t2+80] + pcmpeqw m2, m2 + shr ecx, 4 + cmp t2d, 32 + adc t2d, 0 + movd m3, ecx + pavgw m2, m1 + psubw m2, m0 + psubw m0, m1 + psraw m2, m3 + paddw m0, m2 + movq [t1], m0 + mov [t1+3*2], t2w +%endif + tzcnt eax, eax + movzx ecx, word [buf+rax+16] + movzx t2d, word [buf+rax+14] + not t4 +%if ARCH_X86_64 + add t6d, 5 +%endif + sub eax, 5 ; setup for merging the tok_br and tok branches + sub t2d, ecx + shl rcx, gprsize*8-16 + add t4, rcx + bsr ecx, t2d + xor ecx, 15 + shl t2d, cl + shl t4, cl + movd m2, t2d + mov [t7+msac.rng], t2d + not t4 + sub t5d, ecx + jae %%end + mov t2, [t7+msac.buf] + mov rcx, [t7+msac.end] +%if UNIX64 == 0 + push t8 +%endif + lea t8, [t2+gprsize] + cmp t8, rcx + ja %%refill_eob + mov t2, [t2] + lea ecx, [t5+23] + add t5d, 16 + shr ecx, 3 + bswap t2 + sub t8, rcx + shl ecx, 3 + shr t2, cl + sub ecx, t5d + mov t5d, gprsize*8-16 + shl t2, cl + mov [t7+msac.buf], t8 +%if UNIX64 == 0 + pop t8 +%endif + sub t5d, ecx + xor t4, t2 +%%end: + movp m3, t4 +%if ARCH_X86_64 + add t6d, eax ; CF = tok_br < 3 || tok == 15 + jnc %%loop + lea eax, [t6+30] +%else + add eax, [buf+8] + jnc %%loop + add eax, 30 +%if STACK_ALIGNMENT >= 16 + add esp, 36 +%else + mov esp, [esp] +%endif +%endif + mov [t7+msac.dif], t4 + shr eax, 1 + mov [t7+msac.cnt], t5d + RET +%%refill_eob: + mov t8, rcx + mov ecx, gprsize*8-24 + sub ecx, t5d +%%refill_eob_loop: + cmp t2, t8 + jae %%refill_eob_end + movzx t5d, byte [t2] + inc t2 + shl t5, cl + xor t4, t5 + sub ecx, 8 + jge %%refill_eob_loop +%%refill_eob_end: +%if UNIX64 == 0 + pop t8 +%endif + mov t5d, gprsize*8-24 + mov [t7+msac.buf], t2 + sub t5d, ecx + jmp %%end +%endmacro + +cglobal msac_decode_hi_tok, 0, 7 + ARCH_X86_64, 6 + DECODE_SYMBOL_ADAPT_INIT 1 +%if ARCH_X86_64 == 0 && PIC + LEA t2, min_prob+12*2 + %define base t2-(min_prob+12*2) +%else + %define base 0 +%endif + movq m0, [t1] + movd m2, [t0+msac.rng] + mov eax, [t0+msac.update_cdf] + movq m4, [base+pw_0xff00] + movp m3, [t0+msac.dif] + movq m5, [base+min_prob+12*2] + mov t4, [t0+msac.dif] + mov t5d, [t0+msac.cnt] +%if ARCH_X86_64 + mov t6d, -24 +%endif + movifnidn t7, t0 + test eax, eax + jz .no_update_cdf + HI_TOK 1 +.no_update_cdf: + HI_TOK 0 + +%if ARCH_X86_64 +INIT_YMM avx2 +cglobal msac_decode_symbol_adapt16, 3, 6, 6 + lea rax, [pw_0xff00] + vpbroadcastw m2, [t0+msac.rng] + mova m0, [t1] + vpbroadcastw m3, [t0+msac.dif+6] + vbroadcasti128 m4, [rax] + mov t3d, [t0+msac.update_cdf] + mov t4d, t2d + not t2 + mov r5, rsp +%if 
WIN64 + and rsp, ~31 + sub rsp, 40 +%else + and r5, ~31 + %define buf r5-32 +%endif + psrlw m1, m0, 6 + movd [buf-4], xm2 + pand m2, m4 + psllw m1, 7 + pmulhuw m1, m2 + paddw m1, [rax+t2*2] + mova [buf], m1 + pmaxuw m1, m3 + pcmpeqw m1, m3 + pmovmskb eax, m1 + test t3d, t3d + jz .renorm + movzx t3d, word [t1+t4*2] + pcmpeqw m2, m2 + lea t2d, [t3+80] + shr t2d, 4 + cmp t3d, 32 + adc t3d, 0 + movd xm3, t2d + pavgw m2, m1 + psubw m2, m0 + psubw m0, m1 + psraw m2, xm3 + paddw m0, m2 + mova [t1], m0 + mov [t1+t4*2], t3w +.renorm: + tzcnt eax, eax + mov t4, [t0+msac.dif] + movzx t1d, word [buf+rax-0] + movzx t2d, word [buf+rax-2] + shr eax, 1 +%if WIN64 + mov rsp, r5 +%endif + vzeroupper + jmp m(msac_decode_symbol_adapt4, _sse2).renorm2 +%endif diff --git a/third_party/dav1d/src/x86/msac.h b/third_party/dav1d/src/x86/msac.h new file mode 100644 index 0000000000..0bb632fb31 --- /dev/null +++ b/third_party/dav1d/src/x86/msac.h @@ -0,0 +1,75 @@ +/* + * Copyright © 2019, VideoLAN and dav1d authors + * Copyright © 2019, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_SRC_X86_MSAC_H +#define DAV1D_SRC_X86_MSAC_H + +#include "src/cpu.h" + +unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf, + size_t n_symbols); +unsigned dav1d_msac_decode_symbol_adapt8_sse2(MsacContext *s, uint16_t *cdf, + size_t n_symbols); +unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf, + size_t n_symbols); +unsigned dav1d_msac_decode_symbol_adapt16_avx2(MsacContext *s, uint16_t *cdf, + size_t n_symbols); +unsigned dav1d_msac_decode_bool_adapt_sse2(MsacContext *s, uint16_t *cdf); +unsigned dav1d_msac_decode_bool_equi_sse2(MsacContext *s); +unsigned dav1d_msac_decode_bool_sse2(MsacContext *s, unsigned f); +unsigned dav1d_msac_decode_hi_tok_sse2(MsacContext *s, uint16_t *cdf); + +#if ARCH_X86_64 || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) +#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_sse2 +#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_sse2 +#define dav1d_msac_decode_hi_tok dav1d_msac_decode_hi_tok_sse2 +#endif + +#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_sse2 +#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_sse2 +#define dav1d_msac_decode_bool dav1d_msac_decode_bool_sse2 + +#if ARCH_X86_64 +#define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb)) + +static ALWAYS_INLINE void msac_init_x86(MsacContext *const s) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (flags & DAV1D_X86_CPU_FLAG_SSE2) { + s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2; + } + + if (flags & DAV1D_X86_CPU_FLAG_AVX2) { + s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2; + } +} + +#elif defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) +#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2 +#endif + +#endif /* DAV1D_SRC_X86_MSAC_H */ diff --git a/third_party/dav1d/src/x86/refmvs.asm b/third_party/dav1d/src/x86/refmvs.asm new file mode 100644 index 0000000000..06f555db11 --- /dev/null +++ b/third_party/dav1d/src/x86/refmvs.asm @@ -0,0 +1,688 @@ +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
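+
+; Informal note on the fixed-point trick used by the save_tmvs variants in
+; this file: refmvs_temporal_block entries are 5 bytes wide, so the x
+; offsets advance in multiples of 5, and candidate offsets are derived via
+;
+;   ; imul cand, x, 0x9999
+;   ; sar  cand, 16        ; cand = x / 5 * 3
+;
+; 0x9999 / 2^16 = 39321 / 65536, just below 3/5. x is a negative byte
+; offset here (the loops count upward toward zero), so the truncating
+; arithmetic right shift rounds toward -inf and the result is exact:
+; e.g. x = -5 gives (-5 * 0x9999) >> 16 = floor(-196605 / 65536) = -3,
+; and likewise for every multiple of 5 in the offset range used, matching
+; the "; x / 5 * 3" comments on the imul below.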
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA 64 + +%macro JMP_TABLE 2-* + %xdefine %%prefix mangle(private_prefix %+ _%1) + %1_table: + %xdefine %%base %1_table + %rep %0 - 1 + dd %%prefix %+ .w%2 - %%base + %rotate 1 + %endrep +%endmacro + +%macro SAVE_TMVS_TABLE 3 ; num_entries, w, suffix + %rep %1 + db %2*3 + db mangle(private_prefix %+ _save_tmvs_%3).write%2 - \ + mangle(private_prefix %+ _save_tmvs_%3).write1 + %endrep +%endmacro + +%if ARCH_X86_64 +splat_mv_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3 + db 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7 + db 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 + db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3 +%endif +save_pack0: db 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0 + db 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1 +save_pack1: db 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2 + db 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3 +save_ref_shuf: db 0, -1, -1, -1, 1, -1, -1, -1, 8, -1, -1, -1, 9, -1, -1, -1 +cond_shuf512: db 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3 +save_cond0: db 0x80, 0x81, 0x82, 0x83, 0x89, 0x84, 0x00, 0x00 +save_cond1: db 0x84, 0x85, 0x86, 0x87, 0x88, 0x80, 0x00, 0x00 +pb_128: times 16 db 128 + +save_tmvs_ssse3_table: SAVE_TMVS_TABLE 2, 16, ssse3 + SAVE_TMVS_TABLE 4, 8, ssse3 + SAVE_TMVS_TABLE 4, 4, ssse3 + SAVE_TMVS_TABLE 5, 2, ssse3 + SAVE_TMVS_TABLE 7, 1, ssse3 + +%if ARCH_X86_64 +save_tmvs_avx2_table: SAVE_TMVS_TABLE 2, 16, avx2 + SAVE_TMVS_TABLE 4, 8, avx2 + SAVE_TMVS_TABLE 4, 4, avx2 + SAVE_TMVS_TABLE 5, 2, avx2 + SAVE_TMVS_TABLE 7, 1, avx2 + +save_tmvs_avx512icl_table: SAVE_TMVS_TABLE 2, 16, avx512icl + SAVE_TMVS_TABLE 4, 8, avx512icl + SAVE_TMVS_TABLE 4, 4, avx512icl + SAVE_TMVS_TABLE 5, 2, avx512icl + SAVE_TMVS_TABLE 7, 1, avx512icl + +JMP_TABLE splat_mv_avx512icl, 1, 2, 4, 8, 16, 32 +JMP_TABLE splat_mv_avx2, 1, 2, 4, 8, 16, 32 +%endif + +JMP_TABLE splat_mv_sse2, 1, 2, 4, 8, 16, 32 + +SECTION .text + +%macro movif32 2 +%if ARCH_X86_32 + mov %1, %2 +%endif +%endmacro + +INIT_XMM ssse3 +; refmvs_temporal_block *rp, ptrdiff_t stride, +; refmvs_block **rr, uint8_t *ref_sign, +; int col_end8, int row_end8, int col_start8, int row_start8 +%if ARCH_X86_64 +cglobal save_tmvs, 4, 13, 11, rp, stride, rr, ref_sign, \ + xend, yend, xstart, ystart +%define base_reg r12 +%else +cglobal save_tmvs, 6, 7, 8, rp, stride, rr, ref_sign, \ + xend, yend, xstart, ystart + movq m5, [ref_signq] + lea strided, [strided*5] + mov stridem, strided + mov r3, xstartm + mov r1, ystartm + DEFINE_ARGS b, ystart, rr, cand, xend, x +%define stridemp r1m +%define m8 [base+pb_128] +%define m9 [base+save_pack0+ 0] +%define m10 [base+save_pack0+16] +%define base_reg r6 +%endif +%define base base_reg-.write1 + LEA base_reg, .write1 +%if ARCH_X86_64 + movifnidn xendd, xendm + movifnidn yendd, yendm + mov xstartd, xstartm + mov ystartd, ystartm + movq m5, [ref_signq] +%endif + movu m4, [base+save_ref_shuf] + movddup m6, [base+save_cond0] + movddup m7, [base+save_cond1] +%if ARCH_X86_64 + mova m8, [base+pb_128] + mova m9, [base+save_pack0+ 0] + mova m10, [base+save_pack0+16] +%endif + psllq m5, 8 +%if ARCH_X86_64 + lea r9d, [xendq*5] + lea xstartd, [xstartq*5] + sub yendd, ystartd + add ystartd, ystartd + lea strideq, [strideq*5] + sub xstartq, r9 + add xendd, r9d + add rpq, r9 + DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand +%else + lea r0, [xendd*5] ; xend5 + lea r3, [r3*5] ; xstart5 + sub r3, r0 ; -w5 + mov r6m, r3 +%define xstartq r6m + add xendd, r0 ; xend6 + add 
r0m, r0 ; rp+xend5 + mov xendm, xendd + sub r5, r1 ; h + add r1, r1 + mov r7m, r1 + mov r5m, r5 +%define hd r5mp + jmp .loop_y_noload +%endif +.loop_y: + movif32 ystartd, r7m + movif32 xendd, xendm +.loop_y_noload: + and ystartd, 30 + mov xq, xstartq + mov bq, [rrq+ystartq*gprsize] + add ystartd, 2 + movif32 r7m, ystartd + lea bq, [bq+xendq*4] +.loop_x: +%if ARCH_X86_32 +%define rpq r3 +%define r10 r1 +%define r10d r1 +%define r11 r4 +%define r11d r4 +%endif + imul candq, xq, 0x9999 ; x / 5 * 3 + sar candq, 16 + movzx r10d, byte [bq+candq*8+22] ; cand_b->bs + movu m0, [bq+candq*8+12] ; cand_b + movzx r11d, byte [base+save_tmvs_ssse3_table+r10*2+0] + movzx r10d, byte [base+save_tmvs_ssse3_table+r10*2+1] + add r10, base_reg + add candq, r11 + jge .calc + movu m1, [bq+candq*8+12] + movzx r11d, byte [bq+candq*8+22] + movzx r11d, byte [base+save_tmvs_ssse3_table+r11*2+1] + add r11, base_reg +.calc: + movif32 rpq, r0m + ; ref check + punpckhqdq m2, m0, m1 + pshufb m2, m4 ; b0.ref0 b0.ref1 b1.ref0 b1.ref1 | ... + pshufb m3, m5, m2 ; ref > 0 && res_sign[ref - 1] + ; mv check + punpcklqdq m2, m0, m1 ; b0.mv0 b0.mv1 b1.mv0 b1.mv1 | ... + pabsw m2, m2 + psrlw m2, 12 ; (abs(mv.x) | abs(mv.y)) < 4096 + ; res + pcmpgtd m3, m2 + pshufd m2, m3, q2301 + pand m3, m6 ; b0c0 b0c1 b1c0 b1c1 | ... + pand m2, m7 ; b0c1 b0c0 b1c1 b1c0 | ... + por m3, m2 ; b0.shuf b1.shuf | ... + pxor m3, m8 ; if cond0|cond1 == 0 => zero out + pshufb m0, m3 + pshufb m1, m3 + call r10 + jge .next_line + pshufd m0, m1, q3232 + call r11 + jl .loop_x +.next_line: + add rpq, stridemp + movif32 r0m, rpq + dec hd + jg .loop_y + RET +.write1: + movd [rpq+xq+0], m0 + psrlq m0, 8 + movd [rpq+xq+1], m0 + add xq, 5*1 + ret +.write2: + movq [rpq+xq+0], m0 + psrlq m0, 8 + movd [rpq+xq+6], m0 + add xq, 5*2 + ret +.write4: + pshufb m0, m9 + movu [rpq+xq+ 0], m0 + psrlq m0, 8 + movd [rpq+xq+16], m0 + add xq, 5*4 + ret +.write8: + pshufb m2, m0, m9 + movu [rpq+xq+ 0], m2 + pshufb m0, m10 + movu [rpq+xq+16], m0 + psrldq m2, 2 + movq [rpq+xq+32], m2 + add xq, 5*8 + ret +.write16: + pshufb m2, m0, m9 + movu [rpq+xq+ 0], m2 + pshufb m0, m10 + movu [rpq+xq+16], m0 + shufps m2, m0, q1032 + movu [rpq+xq+48], m2 + shufps m2, m0, q2121 + movu [rpq+xq+32], m2 + shufps m0, m2, q1032 + movu [rpq+xq+64], m0 + add xq, 5*16 + ret + +INIT_XMM sse2 +; refmvs_block **rr, refmvs_block *a, int bx4, int bw4, int bh4 +cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4 + add bx4d, bw4d + tzcnt bw4d, bw4d + mova m2, [aq] + LEA aq, splat_mv_sse2_table + lea bx4q, [bx4q*3-32] + movsxd bw4q, [aq+bw4q*4] + movifnidn bh4d, bh4m + pshufd m0, m2, q0210 + pshufd m1, m2, q1021 + pshufd m2, m2, q2102 + add bw4q, aq +.loop: + mov aq, [rrq] + add rrq, gprsize + lea aq, [aq+bx4q*4] + jmp bw4q +.w32: + mova [aq-16*16], m0 + mova [aq-16*15], m1 + mova [aq-16*14], m2 + mova [aq-16*13], m0 + mova [aq-16*12], m1 + mova [aq-16*11], m2 + mova [aq-16*10], m0 + mova [aq-16* 9], m1 + mova [aq-16* 8], m2 + mova [aq-16* 7], m0 + mova [aq-16* 6], m1 + mova [aq-16* 5], m2 +.w16: + mova [aq-16* 4], m0 + mova [aq-16* 3], m1 + mova [aq-16* 2], m2 + mova [aq-16* 1], m0 + mova [aq+16* 0], m1 + mova [aq+16* 1], m2 +.w8: + mova [aq+16* 2], m0 + mova [aq+16* 3], m1 + mova [aq+16* 4], m2 +.w4: + mova [aq+16* 5], m0 + mova [aq+16* 6], m1 + mova [aq+16* 7], m2 + dec bh4d + jg .loop + RET +.w2: + movu [aq+104], m0 + movq [aq+120], m1 + dec bh4d + jg .loop + RET +.w1: + movq [aq+116], m0 + movd [aq+124], m2 + dec bh4d + jg .loop + RET + +%if ARCH_X86_64 +INIT_YMM avx2 +; refmvs_temporal_block *rp, ptrdiff_t 
stride, +; refmvs_block **rr, uint8_t *ref_sign, +; int col_end8, int row_end8, int col_start8, int row_start8 +cglobal save_tmvs, 4, 13, 10, rp, stride, rr, ref_sign, \ + xend, yend, xstart, ystart +%define base r12-.write1 + lea r12, [.write1] + movifnidn xendd, xendm + movifnidn yendd, yendm + mov xstartd, xstartm + mov ystartd, ystartm + vpbroadcastq m4, [ref_signq] + vpbroadcastq m3, [base+save_ref_shuf+8] + vpbroadcastq m5, [base+save_cond0] + vpbroadcastq m6, [base+save_cond1] + vpbroadcastd m7, [base+pb_128] + mova m8, [base+save_pack0] + mova m9, [base+save_pack1] + psllq m4, 8 + lea r9d, [xendq*5] + lea xstartd, [xstartq*5] + sub yendd, ystartd + add ystartd, ystartd + lea strideq, [strideq*5] + sub xstartq, r9 + add xendd, r9d + add rpq, r9 + DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand +.loop_y: + and ystartd, 30 + mov xq, xstartq + mov bq, [rrq+ystartq*8] + add ystartd, 2 + lea bq, [bq+xendq*4] +.loop_x: + imul candq, xq, 0x9999 + sar candq, 16 ; x / 5 * 3 + movzx r10d, byte [bq+candq*8+22] ; cand_b->bs + movu xm0, [bq+candq*8+12] ; cand_b + movzx r11d, byte [base+save_tmvs_avx2_table+r10*2+0] + movzx r10d, byte [base+save_tmvs_avx2_table+r10*2+1] + add r10, r12 + add candq, r11 + jge .calc + vinserti128 m0, [bq+candq*8+12], 1 + movzx r11d, byte [bq+candq*8+22] + movzx r11d, byte [base+save_tmvs_avx2_table+r11*2+1] + add r11, r12 +.calc: + pshufb m1, m0, m3 + pabsw m2, m0 + pshufb m1, m4, m1 ; ref > 0 && res_sign[ref - 1] + psrlw m2, 12 ; (abs(mv.x) | abs(mv.y)) < 4096 + pcmpgtd m1, m2 + pshufd m2, m1, q2301 + pand m1, m5 ; b0.cond0 b1.cond0 + pand m2, m6 ; b0.cond1 b1.cond1 + por m1, m2 ; b0.shuf b1.shuf + pxor m1, m7 ; if cond0|cond1 == 0 => zero out + pshufb m0, m1 + call r10 + jge .next_line + vextracti128 xm0, m0, 1 + call r11 + jl .loop_x +.next_line: + add rpq, strideq + dec hd + jg .loop_y + RET +.write1: + movd [rpq+xq+ 0], xm0 + pextrb [rpq+xq+ 4], xm0, 4 + add xq, 5*1 + ret +.write2: + movq [rpq+xq+0], xm0 + psrlq xm1, xm0, 8 + movd [rpq+xq+6], xm1 + add xq, 5*2 + ret +.write4: + pshufb xm1, xm0, xm8 + movu [rpq+xq+ 0], xm1 + psrlq xm1, 8 + movd [rpq+xq+16], xm1 + add xq, 5*4 + ret +.write8: + vinserti128 m1, m0, xm0, 1 + pshufb m1, m8 + movu [rpq+xq+ 0], m1 + psrldq xm1, 2 + movq [rpq+xq+32], xm1 + add xq, 5*8 + ret +.write16: + vinserti128 m1, m0, xm0, 1 + pshufb m2, m1, m8 + movu [rpq+xq+ 0], m2 + pshufb m1, m9 + movu [rpq+xq+32], m1 + shufps xm2, xm1, q1021 + movu [rpq+xq+64], xm2 + add xq, 5*16 + ret + +cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4 + add bx4d, bw4d + tzcnt bw4d, bw4d + vbroadcasti128 m0, [aq] + lea aq, [splat_mv_avx2_table] + lea bx4q, [bx4q*3-32] + movsxd bw4q, [aq+bw4q*4] + pshufb m0, [splat_mv_shuf] + movifnidn bh4d, bh4m + pshufd m1, m0, q2102 + pshufd m2, m0, q1021 + add bw4q, aq +.loop: + mov aq, [rrq] + add rrq, gprsize + lea aq, [aq+bx4q*4] + jmp bw4q +.w32: + mova [aq-32*8], m0 + mova [aq-32*7], m1 + mova [aq-32*6], m2 + mova [aq-32*5], m0 + mova [aq-32*4], m1 + mova [aq-32*3], m2 +.w16: + mova [aq-32*2], m0 + mova [aq-32*1], m1 + mova [aq+32*0], m2 +.w8: + mova [aq+32*1], m0 + mova [aq+32*2], m1 + mova [aq+32*3], m2 + dec bh4d + jg .loop + RET +.w4: + movu [aq+ 80], m0 + mova [aq+112], xm1 + dec bh4d + jg .loop + RET +.w2: + movu [aq+104], xm0 + movq [aq+120], xm2 + dec bh4d + jg .loop + RET +.w1: + movq [aq+116], xm0 + movd [aq+124], xm1 + dec bh4d + jg .loop + RET + +INIT_ZMM avx512icl +; refmvs_temporal_block *rp, ptrdiff_t stride, +; refmvs_block **rr, uint8_t *ref_sign, +; int col_end8, int row_end8, int 
col_start8, int row_start8 +cglobal save_tmvs, 4, 15, 10, rp, stride, rr, ref_sign, \ + xend, yend, xstart, ystart +%define base r14-.write1 + lea r14, [.write1] + movifnidn xendd, xendm + movifnidn yendd, yendm + mov xstartd, xstartm + mov ystartd, ystartm + psllq m4, [ref_signq]{bcstq}, 8 + vpbroadcastq m3, [base+save_ref_shuf+8] + vbroadcasti32x4 m5, [base+cond_shuf512] + vbroadcasti32x4 m6, [base+save_cond0] + vpbroadcastd m7, [base+pb_128] + mova m8, [base+save_pack0] + movu xm9, [base+save_pack0+4] + lea r9d, [xendq*5] + lea xstartd, [xstartq*5] + sub yendd, ystartd + add ystartd, ystartd + lea strideq, [strideq*5] + sub xstartq, r9 + add xendd, r9d + add rpq, r9 + mov r10d, 0x1f + kmovb k2, r10d + DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand +.loop_y: + and ystartd, 30 + mov xq, xstartq + mov bq, [rrq+ystartq*8] + add ystartd, 2 + lea bq, [bq+xendq*4] +.loop_x: + imul candq, xq, 0x9999 + sar candq, 16 ; x / 5 * 3 + movzx r10d, byte [bq+candq*8+22] ; cand_b->bs + movu xm0, [bq+candq*8+12] ; cand_b + movzx r11d, byte [base+save_tmvs_avx512icl_table+r10*2+0] + movzx r10d, byte [base+save_tmvs_avx512icl_table+r10*2+1] + add r10, r14 + add candq, r11 + jge .calc + movzx r11d, byte [bq+candq*8+22] + vinserti32x4 ym0, [bq+candq*8+12], 1 + movzx r12d, byte [base+save_tmvs_avx512icl_table+r11*2+0] + movzx r11d, byte [base+save_tmvs_avx512icl_table+r11*2+1] + add r11, r14 + add candq, r12 + jge .calc + movzx r12d, byte [bq+candq*8+22] + vinserti32x4 m0, [bq+candq*8+12], 2 + movzx r13d, byte [base+save_tmvs_avx512icl_table+r12*2+0] + movzx r12d, byte [base+save_tmvs_avx512icl_table+r12*2+1] + add r12, r14 + add candq, r13 + jge .calc + vinserti32x4 m0, [bq+candq*8+12], 3 + movzx r13d, byte [bq+candq*8+22] + movzx r13d, byte [base+save_tmvs_avx512icl_table+r13*2+1] + add r13, r14 +.calc: + pshufb m1, m0, m3 + pabsw m2, m0 + pshufb m1, m4, m1 ; ref > 0 && res_sign[ref - 1] + psrlw m2, 12 ; (abs(mv.x) | abs(mv.y)) < 4096 + psubd m2, m1 + pshufb m2, m5 ; c0 c1 c1 c0 + pand m2, m6 + punpckhqdq m1, m2, m2 + vpternlogd m1, m2, m7, 0x56 ; (c0shuf | c1shuf) ^ 0x80 + pshufb m2, m0, m1 + mova xm0, xm2 + call r10 + jge .next_line + vextracti32x4 xm0, m2, 1 + call r11 + jge .next_line + vextracti32x4 xm0, m2, 2 + call r12 + jge .next_line + vextracti32x4 xm0, m2, 3 + call r13 + jl .loop_x +.next_line: + add rpq, strideq + dec hd + jg .loop_y + RET +.write1: + vmovdqu8 [rpq+xq]{k2}, xm0 + add xq, 5*1 + ret +.write2: + pshufb xm0, xm8 + vmovdqu16 [rpq+xq]{k2}, xm0 + add xq, 5*2 + ret +.write4: + vpermb ym0, ym8, ym0 + vmovdqu32 [rpq+xq]{k2}, ym0 + add xq, 5*4 + ret +.write8: + vpermb m0, m8, m0 + vmovdqu64 [rpq+xq]{k2}, m0 + add xq, 5*8 + ret +.write16: + vpermb m1, m8, m0 + movu [rpq+xq+ 0], m1 + pshufb xm0, xm9 + movu [rpq+xq+64], xm0 + add xq, 5*16 + ret + +INIT_ZMM avx512icl +cglobal splat_mv, 4, 7, 3, rr, a, bx4, bw4, bh4 + vbroadcasti32x4 m0, [aq] + lea r1, [splat_mv_avx512icl_table] + tzcnt bw4d, bw4d + lea bx4d, [bx4q*3] + pshufb m0, [splat_mv_shuf] + movsxd bw4q, [r1+bw4q*4] + mov r6d, bh4m + add bw4q, r1 + lea rrq, [rrq+r6*8] + mov r1d, 0x3f + neg r6 + kmovb k1, r1d + jmp bw4q +.w1: + mov r1, [rrq+r6*8] + vmovdqu16 [r1+bx4q*4]{k1}, xm0 + inc r6 + jl .w1 + RET +.w2: + mov r1, [rrq+r6*8] + vmovdqu32 [r1+bx4q*4]{k1}, ym0 + inc r6 + jl .w2 + RET +.w4: + mov r1, [rrq+r6*8] + vmovdqu64 [r1+bx4q*4]{k1}, m0 + inc r6 + jl .w4 + RET +.w8: + pshufd ym1, ym0, q1021 +.w8_loop: + mov r1, [rrq+r6*8+0] + mov r3, [rrq+r6*8+8] + movu [r1+bx4q*4+ 0], m0 + mova [r1+bx4q*4+64], ym1 + movu [r3+bx4q*4+ 
0], m0 + mova [r3+bx4q*4+64], ym1 + add r6, 2 + jl .w8_loop + RET +.w16: + pshufd m1, m0, q1021 + pshufd m2, m0, q2102 +.w16_loop: + mov r1, [rrq+r6*8+0] + mov r3, [rrq+r6*8+8] + mova [r1+bx4q*4+64*0], m0 + mova [r1+bx4q*4+64*1], m1 + mova [r1+bx4q*4+64*2], m2 + mova [r3+bx4q*4+64*0], m0 + mova [r3+bx4q*4+64*1], m1 + mova [r3+bx4q*4+64*2], m2 + add r6, 2 + jl .w16_loop + RET +.w32: + pshufd m1, m0, q1021 + pshufd m2, m0, q2102 +.w32_loop: + mov r1, [rrq+r6*8] + lea r1, [r1+bx4q*4] + mova [r1+64*0], m0 + mova [r1+64*1], m1 + mova [r1+64*2], m2 + mova [r1+64*3], m0 + mova [r1+64*4], m1 + mova [r1+64*5], m2 + inc r6 + jl .w32_loop + RET +%endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/refmvs.h b/third_party/dav1d/src/x86/refmvs.h new file mode 100644 index 0000000000..9dafa78b13 --- /dev/null +++ b/third_party/dav1d/src/x86/refmvs.h @@ -0,0 +1,61 @@ +/* + * Copyright © 2021, VideoLAN and dav1d authors + * Copyright © 2021, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" +#include "src/refmvs.h" + +decl_save_tmvs_fn(dav1d_save_tmvs_ssse3); +decl_save_tmvs_fn(dav1d_save_tmvs_avx2); +decl_save_tmvs_fn(dav1d_save_tmvs_avx512icl); + +decl_splat_mv_fn(dav1d_splat_mv_sse2); +decl_splat_mv_fn(dav1d_splat_mv_avx2); +decl_splat_mv_fn(dav1d_splat_mv_avx512icl); + +static ALWAYS_INLINE void refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return; + + c->splat_mv = dav1d_splat_mv_sse2; + + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + + c->save_tmvs = dav1d_save_tmvs_ssse3; + +#if ARCH_X86_64 + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; + + c->save_tmvs = dav1d_save_tmvs_avx2; + c->splat_mv = dav1d_splat_mv_avx2; + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + + c->save_tmvs = dav1d_save_tmvs_avx512icl; + c->splat_mv = dav1d_splat_mv_avx512icl; +#endif +} -- cgit v1.2.3